author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
commit     19fcec84d8d7d21e796c7624e521b60d28ee21ed
tree       42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc  /src/rocksdb/db
parent     Initial commit.
download   ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
           ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip

Adding upstream version 16.2.11+ds. (upstream/16.2.11+ds, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/db')
-rw-r--r--  src/rocksdb/db/arena_wrapped_db_iter.cc  106
-rw-r--r--  src/rocksdb/db/arena_wrapped_db_iter.h  112
-rw-r--r--  src/rocksdb/db/blob_index.h  179
-rw-r--r--  src/rocksdb/db/builder.cc  263
-rw-r--r--  src/rocksdb/db/builder.h  88
-rw-r--r--  src/rocksdb/db/c.cc  4451
-rw-r--r--  src/rocksdb/db/c_test.c  1866
-rw-r--r--  src/rocksdb/db/column_family.cc  1523
-rw-r--r--  src/rocksdb/db/column_family.h  757
-rw-r--r--  src/rocksdb/db/column_family_test.cc  3387
-rw-r--r--  src/rocksdb/db/compact_files_test.cc  421
-rw-r--r--  src/rocksdb/db/compacted_db_impl.cc  160
-rw-r--r--  src/rocksdb/db/compacted_db_impl.h  113
-rw-r--r--  src/rocksdb/db/compaction/compaction.cc  564
-rw-r--r--  src/rocksdb/db/compaction/compaction.h  384
-rw-r--r--  src/rocksdb/db/compaction/compaction_iteration_stats.h  37
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.cc  774
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.h  240
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator_test.cc  976
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.cc  1700
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.h  198
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_stats_test.cc  1043
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_test.cc  1082
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.cc  1131
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.h  313
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.cc  244
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.h  53
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.cc  558
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.h  32
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_test.cc  1741
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.cc  1105
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.h  31
-rw-r--r--  src/rocksdb/db/comparator_db_test.cc  660
-rw-r--r--  src/rocksdb/db/convenience.cc  77
-rw-r--r--  src/rocksdb/db/corruption_test.cc  613
-rw-r--r--  src/rocksdb/db/cuckoo_table_db_test.cc  351
-rw-r--r--  src/rocksdb/db/db_basic_test.cc  2545
-rw-r--r--  src/rocksdb/db/db_blob_index_test.cc  436
-rw-r--r--  src/rocksdb/db/db_block_cache_test.cc  761
-rw-r--r--  src/rocksdb/db/db_bloom_filter_test.cc  1910
-rw-r--r--  src/rocksdb/db/db_compaction_filter_test.cc  872
-rw-r--r--  src/rocksdb/db/db_compaction_test.cc  5167
-rw-r--r--  src/rocksdb/db/db_dynamic_level_test.cc  505
-rw-r--r--  src/rocksdb/db/db_encryption_test.cc  122
-rw-r--r--  src/rocksdb/db/db_filesnapshot.cc  177
-rw-r--r--  src/rocksdb/db/db_flush_test.cc  784
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.cc  4550
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.h  2107
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_compaction_flush.cc  3116
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_debug.cc  294
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_experimental.cc  151
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_files.cc  667
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_open.cc  1651
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.cc  221
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.h  137
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.cc  671
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.h  333
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_write.cc  1839
-rw-r--r--  src/rocksdb/db/db_impl/db_secondary_test.cc  869
-rw-r--r--  src/rocksdb/db/db_info_dumper.cc  123
-rw-r--r--  src/rocksdb/db/db_info_dumper.h  14
-rw-r--r--  src/rocksdb/db/db_inplace_update_test.cc  177
-rw-r--r--  src/rocksdb/db/db_io_failure_test.cc  568
-rw-r--r--  src/rocksdb/db/db_iter.cc  1310
-rw-r--r--  src/rocksdb/db/db_iter.h  344
-rw-r--r--  src/rocksdb/db/db_iter_stress_test.cc  654
-rw-r--r--  src/rocksdb/db/db_iter_test.cc  3175
-rw-r--r--  src/rocksdb/db/db_iterator_test.cc  2998
-rw-r--r--  src/rocksdb/db/db_log_iter_test.cc  294
-rw-r--r--  src/rocksdb/db/db_memtable_test.cc  340
-rw-r--r--  src/rocksdb/db/db_merge_operand_test.cc  240
-rw-r--r--  src/rocksdb/db/db_merge_operator_test.cc  666
-rw-r--r--  src/rocksdb/db/db_options_test.cc  870
-rw-r--r--  src/rocksdb/db/db_properties_test.cc  1711
-rw-r--r--  src/rocksdb/db/db_range_del_test.cc  1660
-rw-r--r--  src/rocksdb/db/db_sst_test.cc  1227
-rw-r--r--  src/rocksdb/db/db_statistics_test.cc  149
-rw-r--r--  src/rocksdb/db/db_table_properties_test.cc  336
-rw-r--r--  src/rocksdb/db/db_tailing_iter_test.cc  547
-rw-r--r--  src/rocksdb/db/db_test.cc  6605
-rw-r--r--  src/rocksdb/db/db_test2.cc  4695
-rw-r--r--  src/rocksdb/db/db_test_util.cc  1564
-rw-r--r--  src/rocksdb/db/db_test_util.h  1000
-rw-r--r--  src/rocksdb/db/db_universal_compaction_test.cc  2254
-rw-r--r--  src/rocksdb/db/db_wal_test.cc  1586
-rw-r--r--  src/rocksdb/db/db_write_test.cc  329
-rw-r--r--  src/rocksdb/db/dbformat.cc  197
-rw-r--r--  src/rocksdb/db/dbformat.h  671
-rw-r--r--  src/rocksdb/db/dbformat_test.cc  207
-rw-r--r--  src/rocksdb/db/deletefile_test.cc  571
-rw-r--r--  src/rocksdb/db/error_handler.cc  344
-rw-r--r--  src/rocksdb/db/error_handler.h  75
-rw-r--r--  src/rocksdb/db/error_handler_test.cc  871
-rw-r--r--  src/rocksdb/db/event_helpers.cc  223
-rw-r--r--  src/rocksdb/db/event_helpers.h  55
-rw-r--r--  src/rocksdb/db/experimental.cc  50
-rw-r--r--  src/rocksdb/db/external_sst_file_basic_test.cc  1128
-rw-r--r--  src/rocksdb/db/external_sst_file_ingestion_job.cc  731
-rw-r--r--  src/rocksdb/db/external_sst_file_ingestion_job.h  180
-rw-r--r--  src/rocksdb/db/external_sst_file_test.cc  2832
-rw-r--r--  src/rocksdb/db/fault_injection_test.cc  555
-rw-r--r--  src/rocksdb/db/file_indexer.cc  216
-rw-r--r--  src/rocksdb/db/file_indexer.h  142
-rw-r--r--  src/rocksdb/db/file_indexer_test.cc  350
-rw-r--r--  src/rocksdb/db/filename_test.cc  180
-rw-r--r--  src/rocksdb/db/flush_job.cc  466
-rw-r--r--  src/rocksdb/db/flush_job.h  158
-rw-r--r--  src/rocksdb/db/flush_job_test.cc  498
-rw-r--r--  src/rocksdb/db/flush_scheduler.cc  86
-rw-r--r--  src/rocksdb/db/flush_scheduler.h  54
-rw-r--r--  src/rocksdb/db/forward_iterator.cc  975
-rw-r--r--  src/rocksdb/db/forward_iterator.h  160
-rw-r--r--  src/rocksdb/db/forward_iterator_bench.cc  377
-rw-r--r--  src/rocksdb/db/import_column_family_job.cc  276
-rw-r--r--  src/rocksdb/db/import_column_family_job.h  72
-rw-r--r--  src/rocksdb/db/import_column_family_test.cc  567
-rw-r--r--  src/rocksdb/db/internal_stats.cc  1424
-rw-r--r--  src/rocksdb/db/internal_stats.h  697
-rw-r--r--  src/rocksdb/db/job_context.h  219
-rw-r--r--  src/rocksdb/db/listener_test.cc  1042
-rw-r--r--  src/rocksdb/db/log_format.h  48
-rw-r--r--  src/rocksdb/db/log_reader.cc  624
-rw-r--r--  src/rocksdb/db/log_reader.h  189
-rw-r--r--  src/rocksdb/db/log_test.cc  928
-rw-r--r--  src/rocksdb/db/log_writer.cc  162
-rw-r--r--  src/rocksdb/db/log_writer.h  114
-rw-r--r--  src/rocksdb/db/logs_with_prep_tracker.cc  67
-rw-r--r--  src/rocksdb/db/logs_with_prep_tracker.h  63
-rw-r--r--  src/rocksdb/db/lookup_key.h  66
-rw-r--r--  src/rocksdb/db/malloc_stats.cc  54
-rw-r--r--  src/rocksdb/db/malloc_stats.h  24
-rw-r--r--  src/rocksdb/db/manual_compaction_test.cc  160
-rw-r--r--  src/rocksdb/db/memtable.cc  1122
-rw-r--r--  src/rocksdb/db/memtable.h  542
-rw-r--r--  src/rocksdb/db/memtable_list.cc  771
-rw-r--r--  src/rocksdb/db/memtable_list.h  422
-rw-r--r--  src/rocksdb/db/memtable_list_test.cc  922
-rw-r--r--  src/rocksdb/db/merge_context.h  134
-rw-r--r--  src/rocksdb/db/merge_helper.cc  417
-rw-r--r--  src/rocksdb/db/merge_helper.h  194
-rw-r--r--  src/rocksdb/db/merge_helper_test.cc  290
-rw-r--r--  src/rocksdb/db/merge_operator.cc  86
-rw-r--r--  src/rocksdb/db/merge_test.cc  504
-rw-r--r--  src/rocksdb/db/obsolete_files_test.cc  222
-rw-r--r--  src/rocksdb/db/options_file_test.cc  119
-rw-r--r--  src/rocksdb/db/perf_context_test.cc  981
-rw-r--r--  src/rocksdb/db/pinned_iterators_manager.h  87
-rw-r--r--  src/rocksdb/db/plain_table_db_test.cc  1375
-rw-r--r--  src/rocksdb/db/pre_release_callback.h  38
-rw-r--r--  src/rocksdb/db/prefix_test.cc  895
-rw-r--r--  src/rocksdb/db/range_del_aggregator.cc  484
-rw-r--r--  src/rocksdb/db/range_del_aggregator.h  441
-rw-r--r--  src/rocksdb/db/range_del_aggregator_bench.cc  260
-rw-r--r--  src/rocksdb/db/range_del_aggregator_test.cc  709
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter.cc  439
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter.h  256
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter_test.cc  552
-rw-r--r--  src/rocksdb/db/read_callback.h  53
-rw-r--r--  src/rocksdb/db/repair.cc  691
-rw-r--r--  src/rocksdb/db/repair_test.cc  369
-rw-r--r--  src/rocksdb/db/snapshot_checker.h  61
-rw-r--r--  src/rocksdb/db/snapshot_impl.cc  26
-rw-r--r--  src/rocksdb/db/snapshot_impl.h  167
-rw-r--r--  src/rocksdb/db/table_cache.cc  668
-rw-r--r--  src/rocksdb/db/table_cache.h  226
-rw-r--r--  src/rocksdb/db/table_properties_collector.cc  74
-rw-r--r--  src/rocksdb/db/table_properties_collector.h  107
-rw-r--r--  src/rocksdb/db/table_properties_collector_test.cc  515
-rw-r--r--  src/rocksdb/db/transaction_log_impl.cc  315
-rw-r--r--  src/rocksdb/db/transaction_log_impl.h  127
-rw-r--r--  src/rocksdb/db/trim_history_scheduler.cc  54
-rw-r--r--  src/rocksdb/db/trim_history_scheduler.h  44
-rw-r--r--  src/rocksdb/db/version_builder.cc  545
-rw-r--r--  src/rocksdb/db/version_builder.h  48
-rw-r--r--  src/rocksdb/db/version_builder_test.cc  349
-rw-r--r--  src/rocksdb/db/version_edit.cc  826
-rw-r--r--  src/rocksdb/db/version_edit.h  438
-rw-r--r--  src/rocksdb/db/version_edit_test.cc  286
-rw-r--r--  src/rocksdb/db/version_set.cc  6005
-rw-r--r--  src/rocksdb/db/version_set.h  1251
-rw-r--r--  src/rocksdb/db/version_set_test.cc  1287
-rw-r--r--  src/rocksdb/db/wal_manager.cc  510
-rw-r--r--  src/rocksdb/db/wal_manager.h  114
-rw-r--r--  src/rocksdb/db/wal_manager_test.cc  338
-rw-r--r--  src/rocksdb/db/write_batch.cc  2092
-rw-r--r--  src/rocksdb/db/write_batch_base.cc  94
-rw-r--r--  src/rocksdb/db/write_batch_internal.h  250
-rw-r--r--  src/rocksdb/db/write_batch_test.cc  888
-rw-r--r--  src/rocksdb/db/write_callback.h  27
-rw-r--r--  src/rocksdb/db/write_callback_test.cc  452
-rw-r--r--  src/rocksdb/db/write_controller.cc  128
-rw-r--r--  src/rocksdb/db/write_controller.h  144
-rw-r--r--  src/rocksdb/db/write_controller_test.cc  135
-rw-r--r--  src/rocksdb/db/write_thread.cc  777
-rw-r--r--  src/rocksdb/db/write_thread.h  431
195 files changed, 144976 insertions(+), 0 deletions(-)
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.cc b/src/rocksdb/db/arena_wrapped_db_iter.cc
new file mode 100644
index 000000000..f43282a75
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/arena_wrapped_db_iter.h"
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
+ std::string* prop) {
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ // First try to pass the value returned from inner iterator.
+ if (!db_iter_->GetProperty(prop_name, prop).ok()) {
+ *prop = ToString(sv_number_);
+ }
+ return Status::OK();
+ }
+ return db_iter_->GetProperty(prop_name, prop);
+}
+
+void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iteration,
+ uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool allow_blob,
+ bool allow_refresh) {
+ auto mem = arena_.AllocateAligned(sizeof(DBIter));
+ db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options,
+ cf_options.user_comparator, nullptr, sequence,
+ true, max_sequential_skip_in_iteration,
+ read_callback, db_impl, cfd, allow_blob);
+ sv_number_ = version_number;
+ allow_refresh_ = allow_refresh;
+}
+
+Status ArenaWrappedDBIter::Refresh() {
+ if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
+ return Status::NotSupported("Creating renew iterator is not allowed.");
+ }
+ assert(db_iter_ != nullptr);
+ // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
+ // correct behavior. Will be corrected automatically when we take a snapshot
+ // here for the case of WritePreparedTxnDB.
+ SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+ uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
+ if (sv_number_ != cur_sv_number) {
+ Env* env = db_iter_->env();
+ db_iter_->~DBIter();
+ arena_.~Arena();
+ new (&arena_) Arena();
+
+ SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
+ if (read_callback_) {
+ read_callback_->Refresh(latest_seq);
+ }
+ Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
+ latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_,
+ allow_refresh_);
+
+ InternalIterator* internal_iter = db_impl_->NewInternalIterator(
+ read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(),
+ latest_seq);
+ SetIterUnderDBIter(internal_iter);
+ } else {
+ db_iter_->set_sequence(latest_seq);
+ db_iter_->set_valid(false);
+ }
+ return Status::OK();
+}
+
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+ Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool allow_blob, bool allow_refresh) {
+ ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+ iter->Init(env, read_options, cf_options, mutable_cf_options, sequence,
+ max_sequential_skip_in_iterations, version_number, read_callback,
+ db_impl, cfd, allow_blob, allow_refresh);
+ if (db_impl != nullptr && cfd != nullptr && allow_refresh) {
+ iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback,
+ allow_blob);
+ }
+
+ return iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
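
The GetProperty() override above answers "rocksdb.iterator.super-version-number" even when the inner DBIter cannot, by falling back to the cached sv_number_. A minimal caller-side sketch, using only the public Iterator interface; the DB pointer and function name below are illustrative, not part of the diff:

    // Sketch: reading the super-version number exposed by ArenaWrappedDBIter
    // through the public Iterator::GetProperty() interface.
    #include <iostream>
    #include <memory>
    #include <string>
    #include "rocksdb/db.h"

    void PrintSuperVersionNumber(rocksdb::DB* db) {
      std::unique_ptr<rocksdb::Iterator> it(
          db->NewIterator(rocksdb::ReadOptions()));
      std::string sv;
      // The property name matches the one handled in GetProperty() above.
      rocksdb::Status s =
          it->GetProperty("rocksdb.iterator.super-version-number", &sv);
      if (s.ok()) {
        std::cout << "super-version-number: " << sv << std::endl;
      }
    }
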
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.h b/src/rocksdb/db/arena_wrapped_db_iter.h
new file mode 100644
index 000000000..0c135f857
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+
+// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
+// iterator is supposed to be allocated. This class is used as an entry point of
+// an iterator hierarchy whose memory can be allocated inline. In that way,
+// accessing the iterator tree can be more cache friendly. It is also faster
+// to allocate.
+// When using the class's Iterator interface, the behavior is exactly
+// the same as the inner DBIter.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+ virtual ~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
+
+ // Get the arena to be used to allocate memory for DBIter to be wrapped,
+ // as well as child iterators in it.
+ virtual Arena* GetArena() { return &arena_; }
+ virtual ReadRangeDelAggregator* GetRangeDelAggregator() {
+ return db_iter_->GetRangeDelAggregator();
+ }
+
+ // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+ // a merging iterator.
+ virtual void SetIterUnderDBIter(InternalIterator* iter) {
+ db_iter_->SetIter(iter);
+ }
+
+ bool Valid() const override { return db_iter_->Valid(); }
+ void SeekToFirst() override { db_iter_->SeekToFirst(); }
+ void SeekToLast() override { db_iter_->SeekToLast(); }
+ void Seek(const Slice& target) override { db_iter_->Seek(target); }
+ void SeekForPrev(const Slice& target) override {
+ db_iter_->SeekForPrev(target);
+ }
+ void Next() override { db_iter_->Next(); }
+ void Prev() override { db_iter_->Prev(); }
+ Slice key() const override { return db_iter_->key(); }
+ Slice value() const override { return db_iter_->value(); }
+ Status status() const override { return db_iter_->status(); }
+ bool IsBlob() const { return db_iter_->IsBlob(); }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override;
+
+ Status Refresh() override;
+
+ void Init(Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool allow_blob, bool allow_refresh);
+
+ // Store some parameters so we can refresh the iterator at a later point
+ // with these same params
+ void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl,
+ ColumnFamilyData* cfd, ReadCallback* read_callback,
+ bool allow_blob) {
+ read_options_ = read_options;
+ db_impl_ = db_impl;
+ cfd_ = cfd;
+ read_callback_ = read_callback;
+ allow_blob_ = allow_blob;
+ }
+
+ private:
+ DBIter* db_iter_;
+ Arena arena_;
+ uint64_t sv_number_;
+ ColumnFamilyData* cfd_ = nullptr;
+ DBImpl* db_impl_ = nullptr;
+ ReadOptions read_options_;
+ ReadCallback* read_callback_;
+ bool allow_blob_ = false;
+ bool allow_refresh_ = true;
+};
+
+// Generate an arena-wrapped DB iterator.
+// `db_impl` and `cfd` are used for renewal. If left null, renewal will not
+// be supported.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+ Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl = nullptr,
+ ColumnFamilyData* cfd = nullptr, bool allow_blob = false,
+ bool allow_refresh = true);
+} // namespace ROCKSDB_NAMESPACE
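
Init() above placement-constructs the DBIter inside the arena (AllocateAligned() followed by placement new), and the destructor invokes ~DBIter() by hand because the arena, not delete, owns the memory. A minimal sketch of that allocation pattern, with a hypothetical Payload type standing in for DBIter:

    // Sketch of the arena placement-new pattern used by ArenaWrappedDBIter::Init()
    // and its destructor. Payload is a hypothetical stand-in type.
    #include <new>
    #include "memory/arena.h"  // internal RocksDB header, as used in the diff

    struct Payload {
      explicit Payload(int v) : value(v) {}
      int value;
    };

    void ArenaPlacementExample() {
      ROCKSDB_NAMESPACE::Arena arena;
      // Allocate raw, aligned memory from the arena...
      void* mem = arena.AllocateAligned(sizeof(Payload));
      // ...construct the object in place...
      Payload* p = new (mem) Payload(42);
      // ...and destroy it explicitly; the arena reclaims the memory when it
      // goes out of scope, so there is no delete.
      p->~Payload();
    }
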
diff --git a/src/rocksdb/db/blob_index.h b/src/rocksdb/db/blob_index.h
new file mode 100644
index 000000000..483a7b97b
--- /dev/null
+++ b/src/rocksdb/db/blob_index.h
@@ -0,0 +1,179 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <sstream>
+#include <string>
+
+#include "rocksdb/options.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// BlobIndex is a pointer to the blob and metadata of the blob. The index is
+// stored in base DB as ValueType::kTypeBlobIndex.
+// There are three types of blob index:
+//
+// kInlinedTTL:
+// +------+------------+---------------+
+// | type | expiration | value |
+// +------+------------+---------------+
+// | char | varint64 | variable size |
+// +------+------------+---------------+
+//
+// kBlob:
+// +------+-------------+----------+----------+-------------+
+// | type | file number | offset | size | compression |
+// +------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | char |
+// +------+-------------+----------+----------+-------------+
+//
+// kBlobTTL:
+// +------+------------+-------------+----------+----------+-------------+
+// | type | expiration | file number | offset | size | compression |
+// +------+------------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | varint64 | char |
+// +------+------------+-------------+----------+----------+-------------+
+//
+// There isn't a kInlined (without TTL) type since we can store it as a plain
+// value (i.e. ValueType::kTypeValue).
+class BlobIndex {
+ public:
+ enum class Type : unsigned char {
+ kInlinedTTL = 0,
+ kBlob = 1,
+ kBlobTTL = 2,
+ kUnknown = 3,
+ };
+
+ BlobIndex() : type_(Type::kUnknown) {}
+
+ bool IsInlined() const { return type_ == Type::kInlinedTTL; }
+
+ bool HasTTL() const {
+ return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL;
+ }
+
+ uint64_t expiration() const {
+ assert(HasTTL());
+ return expiration_;
+ }
+
+ const Slice& value() const {
+ assert(IsInlined());
+ return value_;
+ }
+
+ uint64_t file_number() const {
+ assert(!IsInlined());
+ return file_number_;
+ }
+
+ uint64_t offset() const {
+ assert(!IsInlined());
+ return offset_;
+ }
+
+ uint64_t size() const {
+ assert(!IsInlined());
+ return size_;
+ }
+
+ Status DecodeFrom(Slice slice) {
+ static const std::string kErrorMessage = "Error while decoding blob index";
+ assert(slice.size() > 0);
+ type_ = static_cast<Type>(*slice.data());
+ if (type_ >= Type::kUnknown) {
+ return Status::Corruption(
+ kErrorMessage,
+ "Unknown blob index type: " + ToString(static_cast<char>(type_)));
+ }
+ slice = Slice(slice.data() + 1, slice.size() - 1);
+ if (HasTTL()) {
+ if (!GetVarint64(&slice, &expiration_)) {
+ return Status::Corruption(kErrorMessage, "Corrupted expiration");
+ }
+ }
+ if (IsInlined()) {
+ value_ = slice;
+ } else {
+ if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) &&
+ GetVarint64(&slice, &size_) && slice.size() == 1) {
+ compression_ = static_cast<CompressionType>(*slice.data());
+ } else {
+ return Status::Corruption(kErrorMessage, "Corrupted blob offset");
+ }
+ }
+ return Status::OK();
+ }
+
+ std::string DebugString(bool output_hex) const {
+ std::ostringstream oss;
+
+ if (IsInlined()) {
+ oss << "[inlined blob] value:" << value_.ToString(output_hex);
+ } else {
+ oss << "[blob ref] file:" << file_number_ << " offset:" << offset_
+ << " size:" << size_;
+ }
+
+ if (HasTTL()) {
+ oss << " exp:" << expiration_;
+ }
+
+ return oss.str();
+ }
+
+ static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
+ const Slice& value) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(1 + kMaxVarint64Length + value.size());
+ dst->push_back(static_cast<char>(Type::kInlinedTTL));
+ PutVarint64(dst, expiration);
+ dst->append(value.data(), value.size());
+ }
+
+ static void EncodeBlob(std::string* dst, uint64_t file_number,
+ uint64_t offset, uint64_t size,
+ CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 3 + 2);
+ dst->push_back(static_cast<char>(Type::kBlob));
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ static void EncodeBlobTTL(std::string* dst, uint64_t expiration,
+ uint64_t file_number, uint64_t offset,
+ uint64_t size, CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 4 + 2);
+ dst->push_back(static_cast<char>(Type::kBlobTTL));
+ PutVarint64(dst, expiration);
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ private:
+ Type type_ = Type::kUnknown;
+ uint64_t expiration_ = 0;
+ Slice value_;
+ uint64_t file_number_ = 0;
+ uint64_t offset_ = 0;
+ uint64_t size_ = 0;
+ CompressionType compression_ = kNoCompression;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
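
A small round-trip sketch for the kBlob layout documented above, built only from the EncodeBlob()/DecodeFrom() helpers in this header; the numeric values are illustrative:

    // Sketch: encode a kBlob index and decode it back (internal header API).
    #include <cassert>
    #include <string>
    #include "db/blob_index.h"

    void BlobIndexRoundTrip() {
      std::string encoded;
      ROCKSDB_NAMESPACE::BlobIndex::EncodeBlob(
          &encoded, /*file_number=*/7, /*offset=*/4096, /*size=*/128,
          ROCKSDB_NAMESPACE::kNoCompression);

      ROCKSDB_NAMESPACE::BlobIndex index;
      ROCKSDB_NAMESPACE::Status s =
          index.DecodeFrom(ROCKSDB_NAMESPACE::Slice(encoded));
      assert(s.ok());
      assert(!index.IsInlined() && !index.HasTTL());
      assert(index.file_number() == 7 && index.offset() == 4096 &&
             index.size() == 128);
    }
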
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
new file mode 100644
index 000000000..fdb814cbb
--- /dev/null
+++ b/src/rocksdb/db/builder.cc
@@ -0,0 +1,263 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#include "db/compaction/compaction_iterator.h"
+#include "db/dbformat.h"
+#include "db/event_helpers.h"
+#include "db/internal_stats.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(
+ const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
+ const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, const std::string& column_family_name,
+ WritableFileWriter* file, const CompressionType compression_type,
+ uint64_t sample_for_compression, const CompressionOptions& compression_opts,
+ int level, const bool skip_filters, const uint64_t creation_time,
+ const uint64_t oldest_key_time, const uint64_t target_file_size,
+ const uint64_t file_creation_time) {
+ assert((column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+ column_family_name.empty());
+ return ioptions.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, internal_comparator,
+ int_tbl_prop_collector_factories, compression_type,
+ sample_for_compression, compression_opts,
+ skip_filters, column_family_name, level,
+ creation_time, oldest_key_time, target_file_size,
+ file_creation_time),
+ column_family_id, file);
+}
+
+Status BuildTable(
+ const std::string& dbname, Env* env, FileSystem* fs,
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
+ TableCache* table_cache, InternalIterator* iter,
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters,
+ FileMetaData* meta, const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, const std::string& column_family_name,
+ std::vector<SequenceNumber> snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, const CompressionType compression,
+ uint64_t sample_for_compression, const CompressionOptions& compression_opts,
+ bool paranoid_file_checks, InternalStats* internal_stats,
+ TableFileCreationReason reason, EventLogger* event_logger, int job_id,
+ const Env::IOPriority io_priority, TableProperties* table_properties,
+ int level, const uint64_t creation_time, const uint64_t oldest_key_time,
+ Env::WriteLifeTimeHint write_hint, const uint64_t file_creation_time) {
+ assert((column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+ column_family_name.empty());
+ // Report flush IOStats once for every kReportFlushIOStatsEvery bytes written.
+ const size_t kReportFlushIOStatsEvery = 1048576;
+ Status s;
+ meta->fd.file_size = 0;
+ iter->SeekToFirst();
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+ new CompactionRangeDelAggregator(&internal_comparator, snapshots));
+ for (auto& range_del_iter : range_del_iters) {
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+
+ std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
+ meta->fd.GetPathId());
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(
+ ioptions.listeners, dbname, column_family_name, fname, job_id, reason);
+#endif // !ROCKSDB_LITE
+ TableProperties tp;
+
+ if (iter->Valid() || !range_del_agg->IsEmpty()) {
+ TableBuilder* builder;
+ std::unique_ptr<WritableFileWriter> file_writer;
+ // Currently we only enable dictionary compression during compaction to the
+ // bottommost level.
+ CompressionOptions compression_opts_for_flush(compression_opts);
+ compression_opts_for_flush.max_dict_bytes = 0;
+ compression_opts_for_flush.zstd_max_train_bytes = 0;
+ {
+ std::unique_ptr<FSWritableFile> file;
+#ifndef NDEBUG
+ bool use_direct_writes = file_options.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
+#endif // !NDEBUG
+ s = NewWritableFile(fs, fname, &file, file_options);
+ if (!s.ok()) {
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger, ioptions.listeners, dbname, column_family_name, fname,
+ job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s);
+ return s;
+ }
+ file->SetIOPriority(io_priority);
+ file->SetWriteLifeTimeHint(write_hint);
+
+ file_writer.reset(new WritableFileWriter(
+ std::move(file), fname, file_options, env, ioptions.statistics,
+ ioptions.listeners, ioptions.sst_file_checksum_func));
+
+ builder = NewTableBuilder(
+ ioptions, mutable_cf_options, internal_comparator,
+ int_tbl_prop_collector_factories, column_family_id,
+ column_family_name, file_writer.get(), compression,
+ sample_for_compression, compression_opts_for_flush, level,
+ false /* skip_filters */, creation_time, oldest_key_time,
+ 0 /*target_file_size*/, file_creation_time);
+ }
+
+ MergeHelper merge(env, internal_comparator.user_comparator(),
+ ioptions.merge_operator, nullptr, ioptions.info_log,
+ true /* internal key corruption is not ok */,
+ snapshots.empty() ? 0 : snapshots.back(),
+ snapshot_checker);
+
+ CompactionIterator c_iter(
+ iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber,
+ &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env,
+ ShouldReportDetailedTime(env, ioptions.statistics),
+ true /* internal key corruption is not ok */, range_del_agg.get());
+ c_iter.SeekToFirst();
+ for (; c_iter.Valid(); c_iter.Next()) {
+ const Slice& key = c_iter.key();
+ const Slice& value = c_iter.value();
+ const ParsedInternalKey& ikey = c_iter.ikey();
+ builder->Add(key, value);
+ meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
+
+ // TODO(noetzli): Update stats after flush, too.
+ if (io_priority == Env::IO_HIGH &&
+ IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+ }
+ }
+
+ auto range_del_it = range_del_agg->NewIterator();
+ for (range_del_it->SeekToFirst(); range_del_it->Valid();
+ range_del_it->Next()) {
+ auto tombstone = range_del_it->Tombstone();
+ auto kv = tombstone.Serialize();
+ builder->Add(kv.first.Encode(), kv.second);
+ meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
+ tombstone.seq_, internal_comparator);
+ }
+
+ // Finish and check for builder errors
+ tp = builder->GetTableProperties();
+ bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0;
+ s = c_iter.status();
+ if (!s.ok() || empty) {
+ builder->Abandon();
+ } else {
+ s = builder->Finish();
+ }
+
+ if (s.ok() && !empty) {
+ uint64_t file_size = builder->FileSize();
+ meta->fd.file_size = file_size;
+ meta->marked_for_compaction = builder->NeedCompact();
+ assert(meta->fd.GetFileSize() > 0);
+ tp = builder->GetTableProperties(); // refresh now that builder is finished
+ if (table_properties) {
+ *table_properties = tp;
+ }
+ // Add the checksum information to file metadata.
+ meta->file_checksum = builder->GetFileChecksum();
+ meta->file_checksum_func_name = builder->GetFileChecksumFuncName();
+ }
+ delete builder;
+
+ // Finish and check for file errors
+ if (s.ok() && !empty) {
+ StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
+ s = file_writer->Sync(ioptions.use_fsync);
+ }
+ if (s.ok() && !empty) {
+ s = file_writer->Close();
+ }
+
+ if (s.ok() && !empty) {
+ // Verify that the table is usable
+ // We set for_compaction to false and don't OptimizeForCompactionTableRead
+ // here because this is a special case after we finish the table building.
+ // No matter whether use_direct_io_for_flush_and_compaction is true,
+ // we regard this verification as user reads, since the goal is
+ // to cache the table here for further user reads.
+ std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
+ ReadOptions(), file_options, internal_comparator, *meta,
+ nullptr /* range_del_agg */,
+ mutable_cf_options.prefix_extractor.get(), nullptr,
+ (internal_stats == nullptr) ? nullptr
+ : internal_stats->GetFileReadHist(0),
+ TableReaderCaller::kFlush, /*arena=*/nullptr,
+ /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key*/ nullptr));
+ s = it->status();
+ if (s.ok() && paranoid_file_checks) {
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ }
+ s = it->status();
+ }
+ }
+ }
+
+ // Check for input iterator errors
+ if (!iter->status().ok()) {
+ s = iter->status();
+ }
+
+ if (!s.ok() || meta->fd.GetFileSize() == 0) {
+ fs->DeleteFile(fname, IOOptions(), nullptr);
+ }
+
+ if (meta->fd.GetFileSize() == 0) {
+ fname = "(nil)";
+ }
+ // Output to event logger and fire events.
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger, ioptions.listeners, dbname, column_family_name, fname,
+ job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s);
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
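
BuildTable() above clears max_dict_bytes and zstd_max_train_bytes so that flushes never produce dictionary-compressed files; dictionaries are only trained and used during compaction. A hedged sketch of how dictionary compression is normally requested through the public options (field names come from the public rocksdb::Options API; the sizes are illustrative):

    // Sketch: enabling dictionary compression via the public options. The
    // flush path in BuildTable() deliberately zeroes these two fields, so
    // only compactions honor them.
    #include "rocksdb/options.h"

    rocksdb::Options DictionaryCompressionOptions() {
      rocksdb::Options options;
      options.compression = rocksdb::kZSTD;
      options.compression_opts.max_dict_bytes = 16 * 1024;         // 16 KB dictionary
      options.compression_opts.zstd_max_train_bytes = 256 * 1024;  // training sample
      return options;
    }
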
diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h
new file mode 100644
index 000000000..062f1fb80
--- /dev/null
+++ b/src/rocksdb/db/builder.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_properties_collector.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+#include "table/scoped_arena_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+struct EnvOptions;
+class Iterator;
+class SnapshotChecker;
+class TableCache;
+class VersionEdit;
+class TableBuilder;
+class WritableFileWriter;
+class InternalStats;
+
+// @param column_family_name Name of the column family that is also identified
+// by column_family_id, or empty string if unknown. It must outlive the
+// TableBuilder returned by this function.
+TableBuilder* NewTableBuilder(
+ const ImmutableCFOptions& options, const MutableCFOptions& moptions,
+ const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, const std::string& column_family_name,
+ WritableFileWriter* file, const CompressionType compression_type,
+ const uint64_t sample_for_compression,
+ const CompressionOptions& compression_opts, int level,
+ const bool skip_filters = false, const uint64_t creation_time = 0,
+ const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0,
+ const uint64_t file_creation_time = 0);
+
+// Build a Table file from the contents of *iter. The generated file
+// will be named according to the number specified in meta. On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+//
+// @param column_family_name Name of the column family that is also identified
+// by column_family_id, or empty string if unknown.
+extern Status BuildTable(
+ const std::string& dbname, Env* env, FileSystem* fs,
+ const ImmutableCFOptions& options,
+ const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
+ TableCache* table_cache, InternalIterator* iter,
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters,
+ FileMetaData* meta, const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, const std::string& column_family_name,
+ std::vector<SequenceNumber> snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, const CompressionType compression,
+ const uint64_t sample_for_compression,
+ const CompressionOptions& compression_opts, bool paranoid_file_checks,
+ InternalStats* internal_stats, TableFileCreationReason reason,
+ EventLogger* event_logger = nullptr, int job_id = 0,
+ const Env::IOPriority io_priority = Env::IO_HIGH,
+ TableProperties* table_properties = nullptr, int level = -1,
+ const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0,
+ Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
+ const uint64_t file_creation_time = 0);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc
new file mode 100644
index 000000000..db78030df
--- /dev/null
+++ b/src/rocksdb/db/c.cc
@@ -0,0 +1,4451 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/c.h"
+
+#include <stdlib.h>
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/utilities/backupable_db.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/perf_context.h"
+#include "utilities/merge_operators.h"
+
+#include <vector>
+#include <unordered_set>
+#include <map>
+
+using ROCKSDB_NAMESPACE::BackupableDBOptions;
+using ROCKSDB_NAMESPACE::BackupEngine;
+using ROCKSDB_NAMESPACE::BackupID;
+using ROCKSDB_NAMESPACE::BackupInfo;
+using ROCKSDB_NAMESPACE::BatchResult;
+using ROCKSDB_NAMESPACE::BlockBasedTableOptions;
+using ROCKSDB_NAMESPACE::BottommostLevelCompaction;
+using ROCKSDB_NAMESPACE::BytewiseComparator;
+using ROCKSDB_NAMESPACE::Cache;
+using ROCKSDB_NAMESPACE::Checkpoint;
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionFilterContext;
+using ROCKSDB_NAMESPACE::CompactionFilterFactory;
+using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::Comparator;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::CuckooTableOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DBOptions;
+using ROCKSDB_NAMESPACE::DbPath;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::FileLock;
+using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::InfoLogLevel;
+using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::LiveFileMetaData;
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::MemoryUtil;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::MergeOperators;
+using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
+using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
+using ROCKSDB_NAMESPACE::NewLRUCache;
+using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
+using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::PerfContext;
+using ROCKSDB_NAMESPACE::PerfLevel;
+using ROCKSDB_NAMESPACE::PinnableSlice;
+using ROCKSDB_NAMESPACE::RandomAccessFile;
+using ROCKSDB_NAMESPACE::Range;
+using ROCKSDB_NAMESPACE::RateLimiter;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::RestoreOptions;
+using ROCKSDB_NAMESPACE::SequentialFile;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::SliceParts;
+using ROCKSDB_NAMESPACE::SliceTransform;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::TransactionDB;
+using ROCKSDB_NAMESPACE::TransactionDBOptions;
+using ROCKSDB_NAMESPACE::TransactionLogIterator;
+using ROCKSDB_NAMESPACE::TransactionOptions;
+using ROCKSDB_NAMESPACE::WALRecoveryMode;
+using ROCKSDB_NAMESPACE::WritableFile;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+using std::shared_ptr;
+using std::vector;
+using std::unordered_set;
+using std::map;
+
+extern "C" {
+
+struct rocksdb_t { DB* rep; };
+struct rocksdb_backup_engine_t { BackupEngine* rep; };
+struct rocksdb_backup_engine_info_t { std::vector<BackupInfo> rep; };
+struct rocksdb_restore_options_t { RestoreOptions rep; };
+struct rocksdb_iterator_t { Iterator* rep; };
+struct rocksdb_writebatch_t { WriteBatch rep; };
+struct rocksdb_writebatch_wi_t { WriteBatchWithIndex* rep; };
+struct rocksdb_snapshot_t { const Snapshot* rep; };
+struct rocksdb_flushoptions_t { FlushOptions rep; };
+struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
+struct rocksdb_readoptions_t {
+ ReadOptions rep;
+ // stack variables to set pointers to in ReadOptions
+ Slice upper_bound;
+ Slice lower_bound;
+};
+struct rocksdb_writeoptions_t { WriteOptions rep; };
+struct rocksdb_options_t { Options rep; };
+struct rocksdb_compactoptions_t {
+ CompactRangeOptions rep;
+};
+struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; };
+struct rocksdb_cuckoo_table_options_t { CuckooTableOptions rep; };
+struct rocksdb_seqfile_t { SequentialFile* rep; };
+struct rocksdb_randomfile_t { RandomAccessFile* rep; };
+struct rocksdb_writablefile_t { WritableFile* rep; };
+struct rocksdb_wal_iterator_t { TransactionLogIterator* rep; };
+struct rocksdb_wal_readoptions_t { TransactionLogIterator::ReadOptions rep; };
+struct rocksdb_filelock_t { FileLock* rep; };
+struct rocksdb_logger_t {
+ std::shared_ptr<Logger> rep;
+};
+struct rocksdb_cache_t {
+ std::shared_ptr<Cache> rep;
+};
+struct rocksdb_livefiles_t { std::vector<LiveFileMetaData> rep; };
+struct rocksdb_column_family_handle_t { ColumnFamilyHandle* rep; };
+struct rocksdb_envoptions_t { EnvOptions rep; };
+struct rocksdb_ingestexternalfileoptions_t { IngestExternalFileOptions rep; };
+struct rocksdb_sstfilewriter_t { SstFileWriter* rep; };
+struct rocksdb_ratelimiter_t {
+ std::shared_ptr<RateLimiter> rep;
+};
+struct rocksdb_perfcontext_t { PerfContext* rep; };
+struct rocksdb_pinnableslice_t {
+ PinnableSlice rep;
+};
+struct rocksdb_transactiondb_options_t {
+ TransactionDBOptions rep;
+};
+struct rocksdb_transactiondb_t {
+ TransactionDB* rep;
+};
+struct rocksdb_transaction_options_t {
+ TransactionOptions rep;
+};
+struct rocksdb_transaction_t {
+ Transaction* rep;
+};
+struct rocksdb_checkpoint_t {
+ Checkpoint* rep;
+};
+struct rocksdb_optimistictransactiondb_t {
+ OptimisticTransactionDB* rep;
+};
+struct rocksdb_optimistictransaction_options_t {
+ OptimisticTransactionOptions rep;
+};
+
+struct rocksdb_compactionfiltercontext_t {
+ CompactionFilter::Context rep;
+};
+
+struct rocksdb_compactionfilter_t : public CompactionFilter {
+ void* state_;
+ void (*destructor_)(void*);
+ unsigned char (*filter_)(
+ void*,
+ int level,
+ const char* key, size_t key_length,
+ const char* existing_value, size_t value_length,
+ char** new_value, size_t *new_value_length,
+ unsigned char* value_changed);
+ const char* (*name_)(void*);
+ unsigned char ignore_snapshots_;
+
+ ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); }
+
+ bool Filter(int level, const Slice& key, const Slice& existing_value,
+ std::string* new_value, bool* value_changed) const override {
+ char* c_new_value = nullptr;
+ size_t new_value_length = 0;
+ unsigned char c_value_changed = 0;
+ unsigned char result = (*filter_)(
+ state_,
+ level,
+ key.data(), key.size(),
+ existing_value.data(), existing_value.size(),
+ &c_new_value, &new_value_length, &c_value_changed);
+ if (c_value_changed) {
+ new_value->assign(c_new_value, new_value_length);
+ *value_changed = true;
+ }
+ return result;
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ bool IgnoreSnapshots() const override { return ignore_snapshots_; }
+};
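
The struct above adapts plain C function pointers to the CompactionFilter interface; a non-zero return from filter_ drops the key. A sketch of callbacks matching those signatures (a "drop every key with an empty value" filter); the registration call that would hand them to the C API is outside this excerpt:

    // Sketch: C-style callbacks matching the function-pointer fields of
    // rocksdb_compactionfilter_t above.
    #include <stddef.h>

    static unsigned char DropEmptyValues(void* /*state*/, int /*level*/,
                                         const char* /*key*/, size_t /*key_length*/,
                                         const char* /*existing_value*/,
                                         size_t value_length, char** /*new_value*/,
                                         size_t* /*new_value_length*/,
                                         unsigned char* value_changed) {
      *value_changed = 0;        // we never rewrite the value
      return value_length == 0;  // non-zero return means "drop this key"
    }

    static const char* DropEmptyValuesName(void* /*state*/) {
      return "drop-empty-values";
    }

    static void DropEmptyValuesDestructor(void* /*state*/) {}
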
+
+struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
+ void* state_;
+ void (*destructor_)(void*);
+ rocksdb_compactionfilter_t* (*create_compaction_filter_)(
+ void*, rocksdb_compactionfiltercontext_t* context);
+ const char* (*name_)(void*);
+
+ ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ rocksdb_compactionfiltercontext_t ccontext;
+ ccontext.rep = context;
+ CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
+ return std::unique_ptr<CompactionFilter>(cf);
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_comparator_t : public Comparator {
+ void* state_;
+ void (*destructor_)(void*);
+ int (*compare_)(
+ void*,
+ const char* a, size_t alen,
+ const char* b, size_t blen);
+ const char* (*name_)(void*);
+
+ ~rocksdb_comparator_t() override { (*destructor_)(state_); }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ // No-ops since the C binding does not support key shortening methods.
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*create_)(
+ void*,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length);
+ unsigned char (*key_match_)(
+ void*,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length);
+ void (*delete_filter_)(
+ void*,
+ const char* filter, size_t filter_length);
+
+ ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ std::vector<const char*> key_pointers(n);
+ std::vector<size_t> key_sizes(n);
+ for (int i = 0; i < n; i++) {
+ key_pointers[i] = keys[i].data();
+ key_sizes[i] = keys[i].size();
+ }
+ size_t len;
+ char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+ dst->append(filter, len);
+
+ if (delete_filter_ != nullptr) {
+ (*delete_filter_)(state_, filter, len);
+ } else {
+ free(filter);
+ }
+ }
+
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ return (*key_match_)(state_, key.data(), key.size(),
+ filter.data(), filter.size());
+ }
+};
+
+struct rocksdb_mergeoperator_t : public MergeOperator {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*full_merge_)(
+ void*,
+ const char* key, size_t key_length,
+ const char* existing_value, size_t existing_value_length,
+ const char* const* operands_list, const size_t* operands_list_length,
+ int num_operands,
+ unsigned char* success, size_t* new_value_length);
+ char* (*partial_merge_)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length);
+ void (*delete_value_)(
+ void*,
+ const char* value, size_t value_length);
+
+ ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ size_t n = merge_in.operand_list.size();
+ std::vector<const char*> operand_pointers(n);
+ std::vector<size_t> operand_sizes(n);
+ for (size_t i = 0; i < n; i++) {
+ Slice operand(merge_in.operand_list[i]);
+ operand_pointers[i] = operand.data();
+ operand_sizes[i] = operand.size();
+ }
+
+ const char* existing_value_data = nullptr;
+ size_t existing_value_len = 0;
+ if (merge_in.existing_value != nullptr) {
+ existing_value_data = merge_in.existing_value->data();
+ existing_value_len = merge_in.existing_value->size();
+ }
+
+ unsigned char success;
+ size_t new_value_len;
+ char* tmp_new_value = (*full_merge_)(
+ state_, merge_in.key.data(), merge_in.key.size(), existing_value_data,
+ existing_value_len, &operand_pointers[0], &operand_sizes[0],
+ static_cast<int>(n), &success, &new_value_len);
+ merge_out->new_value.assign(tmp_new_value, new_value_len);
+
+ if (delete_value_ != nullptr) {
+ (*delete_value_)(state_, tmp_new_value, new_value_len);
+ } else {
+ free(tmp_new_value);
+ }
+
+ return success;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ size_t operand_count = operand_list.size();
+ std::vector<const char*> operand_pointers(operand_count);
+ std::vector<size_t> operand_sizes(operand_count);
+ for (size_t i = 0; i < operand_count; ++i) {
+ Slice operand(operand_list[i]);
+ operand_pointers[i] = operand.data();
+ operand_sizes[i] = operand.size();
+ }
+
+ unsigned char success;
+ size_t new_value_len;
+ char* tmp_new_value = (*partial_merge_)(
+ state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
+ static_cast<int>(operand_count), &success, &new_value_len);
+ new_value->assign(tmp_new_value, new_value_len);
+
+ if (delete_value_ != nullptr) {
+ (*delete_value_)(state_, tmp_new_value, new_value_len);
+ } else {
+ free(tmp_new_value);
+ }
+
+ return success;
+ }
+};
+
+struct rocksdb_dbpath_t {
+ DbPath rep;
+};
+
+struct rocksdb_env_t {
+ Env* rep;
+ bool is_default;
+};
+
+struct rocksdb_slicetransform_t : public SliceTransform {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*transform_)(
+ void*,
+ const char* key, size_t length,
+ size_t* dst_length);
+ unsigned char (*in_domain_)(
+ void*,
+ const char* key, size_t length);
+ unsigned char (*in_range_)(
+ void*,
+ const char* key, size_t length);
+
+ ~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ Slice Transform(const Slice& src) const override {
+ size_t len;
+ char* dst = (*transform_)(state_, src.data(), src.size(), &len);
+ return Slice(dst, len);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ return (*in_domain_)(state_, src.data(), src.size());
+ }
+
+ bool InRange(const Slice& src) const override {
+ return (*in_range_)(state_, src.data(), src.size());
+ }
+};
+
+struct rocksdb_universal_compaction_options_t {
+ ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep;
+};
+
+static bool SaveError(char** errptr, const Status& s) {
+ assert(errptr != nullptr);
+ if (s.ok()) {
+ return false;
+ } else if (*errptr == nullptr) {
+ *errptr = strdup(s.ToString().c_str());
+ } else {
+ // TODO(sanjay): Merge with existing error?
+ // This is a bug if *errptr is not created by malloc()
+ free(*errptr);
+ *errptr = strdup(s.ToString().c_str());
+ }
+ return true;
+}
+
+static char* CopyString(const std::string& str) {
+ char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+ memcpy(result, str.data(), sizeof(char) * str.size());
+ return result;
+}
+
+rocksdb_t* rocksdb_open(
+ const rocksdb_options_t* options,
+ const char* name,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
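
SaveError() above defines the C API's error convention: the caller passes a char* initialized to NULL, and on failure it comes back pointing at a strdup()'d status message the caller must free(). A caller-side sketch; rocksdb_options_create(), rocksdb_options_set_create_if_missing() and rocksdb_options_destroy() come from the public rocksdb/c.h header and are not shown in this excerpt:

    // Sketch of the caller-side errptr pattern used throughout the C API.
    #include <stdio.h>
    #include <stdlib.h>
    #include "rocksdb/c.h"

    int OpenOrReportError(const char* path) {
      char* err = NULL;
      rocksdb_options_t* options = rocksdb_options_create();
      rocksdb_options_set_create_if_missing(options, 1);

      rocksdb_t* db = rocksdb_open(options, path, &err);
      if (err != NULL) {
        fprintf(stderr, "rocksdb_open failed: %s\n", err);
        free(err);  // SaveError() allocated this with strdup()
        rocksdb_options_destroy(options);
        return 1;
      }
      rocksdb_close(db);
      rocksdb_options_destroy(options);
      return 0;
    }
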
+
+rocksdb_t* rocksdb_open_with_ttl(
+ const rocksdb_options_t* options,
+ const char* name,
+ int ttl,
+ char** errptr) {
+ ROCKSDB_NAMESPACE::DBWithTTL* db;
+ if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+ options->rep, std::string(name), &db, ttl))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only(
+ const rocksdb_options_t* options,
+ const char* name,
+ unsigned char error_if_log_file_exist,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), &db, error_if_log_file_exist))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
+ const char* name,
+ const char* secondary_path,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr,
+ DB::OpenAsSecondary(options->rep, std::string(name),
+ std::string(secondary_path), &db))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+ const rocksdb_options_t* options, const char* path, char** errptr) {
+ BackupEngine* be;
+ if (SaveError(errptr, BackupEngine::Open(options->rep.env,
+ BackupableDBOptions(path,
+ nullptr,
+ true,
+ options->rep.info_log.get()),
+ &be))) {
+ return nullptr;
+ }
+ rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+ result->rep = be;
+ return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+ rocksdb_t* db,
+ char** errptr) {
+ SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+void rocksdb_backup_engine_create_new_backup_flush(rocksdb_backup_engine_t* be,
+ rocksdb_t* db,
+ unsigned char flush_before_backup,
+ char** errptr) {
+ SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup));
+}
+
+void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be,
+ uint32_t num_backups_to_keep,
+ char** errptr) {
+ SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+ return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+ int v) {
+ opt->rep.keep_log_files = v;
+}
+
+void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
+ uint32_t backup_id, char** errptr) {
+ SaveError(errptr, be->rep->VerifyBackup(static_cast<BackupID>(backup_id)));
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, char** errptr) {
+ SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+ std::string(wal_dir),
+ restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+ rocksdb_backup_engine_t* be) {
+ rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+ be->rep->GetBackupInfo(&result->rep);
+ return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+ return static_cast<int>(info->rep.size());
+}
+
+int64_t rocksdb_backup_engine_info_timestamp(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].timestamp;
+}
+
+uint32_t rocksdb_backup_engine_info_backup_id(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].backup_id;
+}
+
+uint64_t rocksdb_backup_engine_info_size(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].size;
+}
+
+uint32_t rocksdb_backup_engine_info_number_files(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+ const rocksdb_backup_engine_info_t* info) {
+ delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+ delete be->rep;
+ delete be;
+}
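+
+// Usage sketch (illustrative, reusing the options/db/err names from the
+// sketch above): back up an open database and restore the latest backup
+// into a fresh directory; the "/tmp/..." paths are placeholders.
+//
+//   rocksdb_backup_engine_t* be =
+//       rocksdb_backup_engine_open(options, "/tmp/rocksdb_backups", &err);
+//   if (err == NULL) {
+//     rocksdb_backup_engine_create_new_backup(be, db, &err);
+//     rocksdb_restore_options_t* ro = rocksdb_restore_options_create();
+//     rocksdb_backup_engine_restore_db_from_latest_backup(
+//         be, "/tmp/restored_db", "/tmp/restored_db", ro, &err);
+//     rocksdb_restore_options_destroy(ro);
+//     rocksdb_backup_engine_close(be);
+//   }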
+
+rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
+ char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
+ const char* checkpoint_dir,
+ uint64_t log_size_for_flush, char** errptr) {
+ SaveError(errptr, checkpoint->rep->CreateCheckpoint(
+ std::string(checkpoint_dir), log_size_for_flush));
+}
+
+void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
+ delete checkpoint->rep;
+ delete checkpoint;
+}
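+
+// Usage sketch (illustrative): take an on-disk checkpoint of a live DB.
+//
+//   rocksdb_checkpoint_t* cp = rocksdb_checkpoint_object_create(db, &err);
+//   if (err == NULL) {
+//     rocksdb_checkpoint_create(cp, "/tmp/rocksdb_checkpoint",
+//                               0 /* log_size_for_flush */, &err);
+//     rocksdb_checkpoint_object_destroy(cp);
+//   }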
+
+void rocksdb_close(rocksdb_t* db) {
+ delete db->rep;
+ delete db;
+}
+
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+ opt->rep.merge_operator =
+ ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
+}
+
+rocksdb_t* rocksdb_open_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::Open(DBOptions(db_options->rep),
+ std::string(name), column_families, &handles, &db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles,
+ unsigned char error_if_log_file_exist, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep),
+ std::string(name), column_families, &handles, &db, error_if_log_file_exist))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ const char* secondary_path, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i != num_column_families; ++i) {
+ column_families.emplace_back(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep));
+ }
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
+ std::string(name),
+ std::string(secondary_path),
+ column_families, &handles, &db))) {
+ return nullptr;
+ }
+ for (size_t i = 0; i != handles.size(); ++i) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+char** rocksdb_list_column_families(
+ const rocksdb_options_t* options,
+ const char* name,
+ size_t* lencfs,
+ char** errptr) {
+ std::vector<std::string> fams;
+ SaveError(errptr,
+ DB::ListColumnFamilies(DBOptions(options->rep),
+ std::string(name), &fams));
+
+ *lencfs = fams.size();
+ char** column_families = static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+ for (size_t i = 0; i < fams.size(); i++) {
+ column_families[i] = strdup(fams[i].c_str());
+ }
+ return column_families;
+}
+
+void rocksdb_list_column_families_destroy(char** list, size_t len) {
+ for (size_t i = 0; i < len; ++i) {
+ free(list[i]);
+ }
+ free(list);
+}
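+
+// Usage sketch (illustrative): open a database together with its column
+// families; rocksdb_list_column_families() above can be used first to
+// discover the family names.
+//
+//   const char* cf_names[2] = {"default", "extra"};
+//   const rocksdb_options_t* cf_opts[2] = {options, options};
+//   rocksdb_column_family_handle_t* cf_handles[2];
+//   rocksdb_t* db = rocksdb_open_column_families(
+//       options, "/path/to/db", 2, cf_names, cf_opts, cf_handles, &err);
+//   ...
+//   rocksdb_column_family_handle_destroy(cf_handles[1]);
+//   rocksdb_column_family_handle_destroy(cf_handles[0]);
+//   rocksdb_close(db);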
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family(
+ rocksdb_t* db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name,
+ char** errptr) {
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr,
+ db->rep->CreateColumnFamily(ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep)));
+ return handle;
+}
+
+void rocksdb_drop_column_family(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* handle,
+ char** errptr) {
+ SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
+}
+
+void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t* handle) {
+ delete handle->rep;
+ delete handle;
+}
+
+void rocksdb_put(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_delete_cf(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+void rocksdb_delete_range_cf(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len,
+ const char* end_key, size_t end_key_len,
+ char** errptr) {
+ SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep,
+ Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len)));
+}
+
+void rocksdb_merge(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Merge(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_merge_cf(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Merge(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_write(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch,
+ char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
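+
+// Usage sketch (illustrative): point lookup.  The returned buffer comes
+// from CopyString(), so it is not NUL-terminated and must be free()'d;
+// rocksdb_readoptions_create()/destroy() are assumed from rocksdb/c.h.
+//
+//   size_t vlen = 0;
+//   rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+//   char* val = rocksdb_get(db, ropts, "key", 3, &vlen, &err);
+//   if (val != NULL) {
+//     /* use val[0..vlen) */
+//     free(val);
+//   }  // NULL with err == NULL simply means "not found".
+//   rocksdb_readoptions_destroy(ropts);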
+
+char* rocksdb_get_cf(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->Get(options->rep, column_family->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+void rocksdb_multi_get(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes,
+ char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_multi_get_cf(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes,
+ char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses = db->rep->MultiGet(options->rep, cfs, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
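+
+// Usage sketch (illustrative): batched lookup.  Every non-NULL value and
+// per-key error string handed back through the output arrays is malloc()'d
+// and owned by the caller.
+//
+//   const char* keys[2] = {"k1", "k2"};
+//   const size_t key_sizes[2] = {2, 2};
+//   char* values[2];
+//   size_t value_sizes[2];
+//   char* errs[2];
+//   rocksdb_multi_get(db, ropts, 2, keys, key_sizes,
+//                     values, value_sizes, errs);
+//   for (int i = 0; i < 2; i++) {
+//     free(values[i]);  // NULL for missing keys; free(NULL) is a no-op
+//     free(errs[i]);
+//   }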
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep);
+ return result;
+}
+
+rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+ rocksdb_t* db, uint64_t seq_number,
+ const rocksdb_wal_readoptions_t* options,
+ char** errptr) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ TransactionLogIterator::ReadOptions ro;
+  if (options != nullptr) {
+ ro = options->rep;
+ }
+ if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) {
+ return nullptr;
+ }
+ rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t;
+ result->rep = iter.release();
+ return result;
+}
+
+void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) {
+ iter->rep->Next();
+}
+
+unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter,
+                             char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
+
+void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+    const rocksdb_wal_iterator_t* iter, uint64_t* seq) {
+ rocksdb_writebatch_t* result = rocksdb_writebatch_create();
+ BatchResult wal_batch = iter->rep->GetBatch();
+ result->rep = std::move(*wal_batch.writeBatchPtr);
+ if (seq != nullptr) {
+ *seq = wal_batch.sequence;
+ }
+ return result;
+}
+
+uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) {
+ return db->rep->GetLatestSequenceNumber();
+}
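+
+// Usage sketch (illustrative): tail the write-ahead log from the current
+// sequence number.  Passing NULL read options selects the defaults chosen
+// in rocksdb_get_updates_since() above.
+//
+//   uint64_t seq = rocksdb_get_latest_sequence_number(db);
+//   rocksdb_wal_iterator_t* it =
+//       rocksdb_get_updates_since(db, seq, NULL, &err);
+//   if (it != NULL) {
+//     while (rocksdb_wal_iter_valid(it)) {
+//       uint64_t batch_seq = 0;
+//       rocksdb_writebatch_t* wb = rocksdb_wal_iter_get_batch(it, &batch_seq);
+//       /* inspect wb, e.g. rocksdb_writebatch_count(wb) */
+//       rocksdb_writebatch_destroy(wb);
+//       rocksdb_wal_iter_next(it);
+//     }
+//     rocksdb_wal_iter_destroy(it);
+//   }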
+
+rocksdb_iterator_t* rocksdb_create_iterator_cf(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep, column_family->rep);
+ return result;
+}
+
+void rocksdb_create_iterators(
+ rocksdb_t *db,
+ rocksdb_readoptions_t* opts,
+ rocksdb_column_family_handle_t** column_families,
+ rocksdb_iterator_t** iterators,
+ size_t size,
+ char** errptr) {
+ std::vector<ColumnFamilyHandle*> column_families_vec;
+ for (size_t i = 0; i < size; i++) {
+ column_families_vec.push_back(column_families[i]->rep);
+ }
+
+ std::vector<Iterator*> res;
+ Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res);
+ assert(res.size() == size);
+ if (SaveError(errptr, status)) {
+ return;
+ }
+
+ for (size_t i = 0; i < size; i++) {
+ iterators[i] = new rocksdb_iterator_t;
+ iterators[i]->rep = res[i];
+ }
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(
+ rocksdb_t* db) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = db->rep->GetSnapshot();
+ return result;
+}
+
+void rocksdb_release_snapshot(
+ rocksdb_t* db,
+ const rocksdb_snapshot_t* snapshot) {
+ db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+char* rocksdb_property_value(
+ rocksdb_t* db,
+ const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human-readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+int rocksdb_property_int(
+ rocksdb_t* db,
+ const char* propname,
+ uint64_t *out_val) {
+ if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int rocksdb_property_int_cf(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* propname,
+ uint64_t *out_val) {
+ if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+char* rocksdb_property_value_cf(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
+    // We use strdup() since we expect human-readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+void rocksdb_approximate_sizes(
+ rocksdb_t* db,
+ int num_ranges,
+ const char* const* range_start_key, const size_t* range_start_key_len,
+ const char* const* range_limit_key, const size_t* range_limit_key_len,
+ uint64_t* sizes) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+ delete[] ranges;
+}
+
+void rocksdb_approximate_sizes_cf(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ int num_ranges,
+ const char* const* range_start_key, const size_t* range_start_key_len,
+ const char* const* range_limit_key, const size_t* range_limit_key_len,
+ uint64_t* sizes) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ db->rep->GetApproximateSizes(column_family->rep, ranges, num_ranges, sizes);
+ delete[] ranges;
+}
+
+void rocksdb_delete_file(
+ rocksdb_t* db,
+ const char* name) {
+ db->rep->DeleteFile(name);
+}
+
+const rocksdb_livefiles_t* rocksdb_livefiles(
+ rocksdb_t* db) {
+ rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
+ db->rep->GetLiveFilesMetaData(&result->rep);
+ return result;
+}
+
+void rocksdb_compact_range(
+ rocksdb_t* db,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ CompactRangeOptions(),
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf(
+ rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ CompactRangeOptions(), column_family->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ opt->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf_opt(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ rocksdb_compactoptions_t* opt,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ opt->rep, column_family->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_flush(
+ rocksdb_t* db,
+ const rocksdb_flushoptions_t* options,
+ char** errptr) {
+ SaveError(errptr, db->rep->Flush(options->rep));
+}
+
+void rocksdb_flush_cf(
+ rocksdb_t* db,
+ const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ char** errptr) {
+ SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
+}
+
+void rocksdb_disable_file_deletions(
+ rocksdb_t* db,
+ char** errptr) {
+ SaveError(errptr, db->rep->DisableFileDeletions());
+}
+
+void rocksdb_enable_file_deletions(
+ rocksdb_t* db,
+ unsigned char force,
+ char** errptr) {
+ SaveError(errptr, db->rep->EnableFileDeletions(force));
+}
+
+void rocksdb_destroy_db(
+ const rocksdb_options_t* options,
+ const char* name,
+ char** errptr) {
+ SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(
+ const rocksdb_options_t* options,
+ const char* name,
+ char** errptr) {
+ SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+ iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+ iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+ iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k,
+ size_t klen) {
+ iter->rep->SeekForPrev(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) {
+ iter->rep->Next();
+}
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) {
+ iter->rep->Prev();
+}
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+ Slice s = iter->rep->key();
+ *klen = s.size();
+ return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+ Slice s = iter->rep->value();
+ *vlen = s.size();
+ return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
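+
+// Usage sketch (illustrative): full forward scan.  The key/value pointers
+// stay valid only until the next call that moves or destroys the iterator.
+//
+//   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ropts);
+//   for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) {
+//     size_t klen, vlen;
+//     const char* k = rocksdb_iter_key(it, &klen);
+//     const char* v = rocksdb_iter_value(it, &vlen);
+//     /* consume k[0..klen), v[0..vlen) */
+//   }
+//   rocksdb_iter_get_error(it, &err);
+//   rocksdb_iter_destroy(it);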
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+ return new rocksdb_writebatch_t;
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+ size_t size) {
+ rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+ b->rep = WriteBatch(std::string(rep, size));
+ return b;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) {
+ delete b;
+}
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) {
+ b->rep.Clear();
+}
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) {
+ return b->rep.Count();
+}
+
+void rocksdb_writebatch_put(
+ rocksdb_writebatch_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_putv(
+ rocksdb_writebatch_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Put(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_merge(
+ rocksdb_writebatch_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_mergev(
+ rocksdb_writebatch_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Merge(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_mergev_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_delete(
+ rocksdb_writebatch_t* b,
+ const char* key, size_t klen) {
+ b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep.Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_deletev(
+ rocksdb_writebatch_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_deletev_cf(
+ rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
+ const char* start_key,
+ size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep.DeleteRange(Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_range_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_rangev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep.DeleteRange(column_family->rep,
+ SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(
+ rocksdb_writebatch_t* b,
+ const char* blob, size_t len) {
+ b->rep.PutLogData(Slice(blob, len));
+}
+
+class H : public WriteBatch::Handler {
+ public:
+ void* state_;
+ void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+ void (*deleted_)(void*, const char* k, size_t klen);
+ void Put(const Slice& key, const Slice& value) override {
+ (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+ }
+ void Delete(const Slice& key) override {
+ (*deleted_)(state_, key.data(), key.size());
+ }
+};
+
+void rocksdb_writebatch_iterate(
+ rocksdb_writebatch_t* b,
+ void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen)) {
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep.Iterate(&handler);
+}
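+
+// Usage sketch (illustrative): build a batch, replay it through the C
+// callbacks above, then commit it with rocksdb_write(); wopts is assumed
+// to be a rocksdb_writeoptions_t* as in the earlier sketches.
+//
+//   static void on_put(void* state, const char* k, size_t klen,
+//                      const char* v, size_t vlen) { /* ... */ }
+//   static void on_delete(void* state, const char* k, size_t klen) {
+//     /* ... */
+//   }
+//
+//   rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+//   rocksdb_writebatch_put(wb, "k", 1, "v", 1);
+//   rocksdb_writebatch_delete(wb, "old", 3);
+//   rocksdb_writebatch_iterate(wb, NULL /* state */, on_put, on_delete);
+//   rocksdb_write(db, wopts, wb, &err);
+//   rocksdb_writebatch_destroy(wb);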
+
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+ *size = b->rep.GetDataSize();
+ return b->rep.Data().c_str();
+}
+
+void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
+ b->rep.SetSavePoint();
+}
+
+void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
+ char** errptr) {
+ SaveError(errptr, b->rep.RollbackToSavePoint());
+}
+
+void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
+ SaveError(errptr, b->rep.PopSavePoint());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
+    size_t reserved_bytes, unsigned char overwrite_key) {
+  rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
+  b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes,
+                                   overwrite_key);
+ return b;
+}
+
+void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
+ if (b->rep) {
+ delete b->rep;
+ }
+ delete b;
+}
+
+void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
+ b->rep->Clear();
+}
+
+int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
+ return b->rep->GetWriteBatch()->Count();
+}
+
+void rocksdb_writebatch_wi_put(
+ rocksdb_writebatch_wi_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep->Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_put_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_putv(
+ rocksdb_writebatch_wi_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Put(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_putv_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_merge(
+ rocksdb_writebatch_wi_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep->Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_merge_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_mergev(
+ rocksdb_writebatch_wi_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Merge(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_mergev_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_delete(
+ rocksdb_writebatch_wi_t* b,
+ const char* key, size_t klen) {
+ b->rep->Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_delete_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep->Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_deletev(
+ rocksdb_writebatch_wi_t* b,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep->Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_deletev_cf(
+ rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
+ const char* start_key,
+ size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep->DeleteRange(Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_range_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_rangev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep->DeleteRange(column_family->rep,
+ SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_put_log_data(
+ rocksdb_writebatch_wi_t* b,
+ const char* blob, size_t len) {
+ b->rep->PutLogData(Slice(blob, len));
+}
+
+void rocksdb_writebatch_wi_iterate(
+ rocksdb_writebatch_wi_t* b,
+ void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen)) {
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep->GetWriteBatch()->Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b, size_t* size) {
+ WriteBatch* wb = b->rep->GetWriteBatch();
+ *size = wb->GetDataSize();
+ return wb->Data().c_str();
+}
+
+void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) {
+ b->rep->SetSavePoint();
+}
+
+void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b,
+ char** errptr) {
+ SaveError(errptr, b->rep->RollbackToSavePoint());
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
+ rocksdb_writebatch_wi_t* wbwi,
+ rocksdb_iterator_t* base_iterator) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep);
+ delete base_iterator;
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep =
+ wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep);
+ delete base_iterator;
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch(
+ rocksdb_writebatch_wi_t* wbwi,
+ const rocksdb_options_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_cf(
+ rocksdb_writebatch_wi_t* wbwi,
+ const rocksdb_options_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db(
+ rocksdb_writebatch_wi_t* wbwi,
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+ rocksdb_writebatch_wi_t* wbwi,
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, column_family->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+void rocksdb_write_writebatch_wi(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_wi_t* wbwi,
+ char** errptr) {
+ WriteBatch* wb = wbwi->rep->GetWriteBatch();
+ SaveError(errptr, db->rep->Write(options->rep, wb));
+}
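+
+// Usage sketch (illustrative): read-your-writes with WriteBatchWithIndex,
+// then commit the indexed batch through rocksdb_write_writebatch_wi().
+//
+//   rocksdb_writebatch_wi_t* wbwi =
+//       rocksdb_writebatch_wi_create(0 /* reserved_bytes */,
+//                                    1 /* overwrite_key */);
+//   rocksdb_writebatch_wi_put(wbwi, "k", 1, "pending", 7);
+//   size_t vlen = 0;
+//   char* v = rocksdb_writebatch_wi_get_from_batch_and_db(
+//       wbwi, db, ropts, "k", 1, &vlen, &err);  // sees the uncommitted put
+//   free(v);
+//   rocksdb_write_writebatch_wi(db, wopts, wbwi, &err);
+//   rocksdb_writebatch_wi_destroy(wbwi);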
+
+rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create() {
+ return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+ rocksdb_block_based_table_options_t* options) {
+ delete options;
+}
+
+void rocksdb_block_based_options_set_block_size(
+ rocksdb_block_based_table_options_t* options, size_t block_size) {
+ options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+ rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+ options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+ options->rep.block_restart_interval = block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_index_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int index_block_restart_interval) {
+ options->rep.index_block_restart_interval = index_block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_metadata_block_size(
+ rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size) {
+ options->rep.metadata_block_size = metadata_block_size;
+}
+
+void rocksdb_block_based_options_set_partition_filters(
+ rocksdb_block_based_table_options_t* options, unsigned char partition_filters) {
+ options->rep.partition_filters = partition_filters;
+}
+
+void rocksdb_block_based_options_set_use_delta_encoding(
+ rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding) {
+ options->rep.use_delta_encoding = use_delta_encoding;
+}
+
+void rocksdb_block_based_options_set_filter_policy(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_filterpolicy_t* filter_policy) {
+ options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char no_block_cache) {
+ options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache) {
+ if (block_cache) {
+ options->rep.block_cache = block_cache->rep;
+ }
+}
+
+void rocksdb_block_based_options_set_block_cache_compressed(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache_compressed) {
+ if (block_cache_compressed) {
+ options->rep.block_cache_compressed = block_cache_compressed->rep;
+ }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_block_based_options_set_format_version(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.format_version = v;
+}
+
+void rocksdb_block_based_options_set_index_type(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_index_type(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.data_block_index_type =
+ static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_hash_ratio(
+ rocksdb_block_based_table_options_t* options, double v) {
+ options->rep.data_block_hash_table_util_ratio = v;
+}
+
+void rocksdb_block_based_options_set_hash_index_allow_collision(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.hash_index_allow_collision = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.cache_index_and_filter_blocks = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.cache_index_and_filter_blocks_with_high_priority = v;
+}
+
+void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
+}
+
+void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.pin_top_level_index_and_filter = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+ rocksdb_options_t *opt,
+ rocksdb_block_based_table_options_t* table_options) {
+ if (table_options) {
+ opt->rep.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep));
+ }
+}
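+
+// Usage sketch (illustrative): a common block-based table configuration.
+// rocksdb_cache_create_lru() and rocksdb_filterpolicy_create_bloom() are
+// assumed to be the cache/filter-policy constructors from rocksdb/c.h.
+//
+//   rocksdb_block_based_table_options_t* bbto =
+//       rocksdb_block_based_options_create();
+//   rocksdb_cache_t* cache = rocksdb_cache_create_lru(64 << 20);  // 64 MB
+//   rocksdb_block_based_options_set_block_cache(bbto, cache);
+//   rocksdb_block_based_options_set_block_size(bbto, 16 * 1024);
+//   rocksdb_block_based_options_set_filter_policy(
+//       bbto, rocksdb_filterpolicy_create_bloom(10));
+//   rocksdb_block_based_options_set_cache_index_and_filter_blocks(bbto, 1);
+//   rocksdb_options_set_block_based_table_factory(options, bbto);
+//   rocksdb_block_based_options_destroy(bbto);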
+
+rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create() {
+ return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(
+ rocksdb_cuckoo_table_options_t* options) {
+ delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+ rocksdb_cuckoo_table_options_t* options, double v) {
+ options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+ options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+ options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+ options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+ options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+ rocksdb_options_t *opt,
+ rocksdb_cuckoo_table_options_t* table_options) {
+ if (table_options) {
+ opt->rep.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep));
+ }
+}
+
+void rocksdb_set_options(
+    rocksdb_t* db, int count, const char* const keys[],
+    const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) {
+    options_map[keys[i]] = values[i];
+  }
+  SaveError(errptr, db->rep->SetOptions(options_map));
+}
+
+void rocksdb_set_options_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count,
+    const char* const keys[], const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) {
+    options_map[keys[i]] = values[i];
+  }
+  SaveError(errptr, db->rep->SetOptions(handle->rep, options_map));
+}
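+
+// Usage sketch (illustrative): adjust mutable options on a live DB; the
+// option names are the usual RocksDB string options.
+//
+//   const char* opt_keys[2] = {"max_write_buffer_number",
+//                              "disable_auto_compactions"};
+//   const char* opt_vals[2] = {"4", "false"};
+//   rocksdb_set_options(db, 2, opt_keys, opt_vals, &err);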
+
+rocksdb_options_t* rocksdb_options_create() {
+ return new rocksdb_options_t;
+}
+
+void rocksdb_options_destroy(rocksdb_options_t* options) {
+ delete options;
+}
+
+void rocksdb_options_increase_parallelism(
+ rocksdb_options_t* opt, int total_threads) {
+ opt->rep.IncreaseParallelism(total_threads);
+}
+
+void rocksdb_options_optimize_for_point_lookup(
+ rocksdb_options_t* opt, uint64_t block_cache_size_mb) {
+ opt->rep.OptimizeForPointLookup(block_cache_size_mb);
+}
+
+void rocksdb_options_optimize_level_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+ opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_optimize_universal_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+ opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_set_allow_ingest_behind(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.allow_ingest_behind = v;
+}
+
+void rocksdb_options_set_compaction_filter(
+ rocksdb_options_t* opt,
+ rocksdb_compactionfilter_t* filter) {
+ opt->rep.compaction_filter = filter;
+}
+
+void rocksdb_options_set_compaction_filter_factory(
+ rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
+ opt->rep.compaction_filter_factory =
+ std::shared_ptr<CompactionFilterFactory>(factory);
+}
+
+void rocksdb_options_compaction_readahead_size(
+ rocksdb_options_t* opt, size_t s) {
+ opt->rep.compaction_readahead_size = s;
+}
+
+void rocksdb_options_set_comparator(
+ rocksdb_options_t* opt,
+ rocksdb_comparator_t* cmp) {
+ opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_merge_operator(
+ rocksdb_options_t* opt,
+ rocksdb_mergeoperator_t* merge_operator) {
+ opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_create_if_missing(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.create_if_missing = v;
+}
+
+void rocksdb_options_set_create_missing_column_families(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.create_missing_column_families = v;
+}
+
+void rocksdb_options_set_error_if_exists(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.error_if_exists = v;
+}
+
+void rocksdb_options_set_paranoid_checks(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.paranoid_checks = v;
+}
+
+void rocksdb_options_set_db_paths(rocksdb_options_t* opt,
+ const rocksdb_dbpath_t** dbpath_values,
+ size_t num_paths) {
+ std::vector<DbPath> db_paths(num_paths);
+ for (size_t i = 0; i < num_paths; ++i) {
+ db_paths[i] = dbpath_values[i]->rep;
+ }
+ opt->rep.db_paths = db_paths;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+ opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+ if (l) {
+ opt->rep.info_log = l->rep;
+ }
+}
+
+void rocksdb_options_set_info_log_level(
+ rocksdb_options_t* opt, int v) {
+ opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
+}
+
+void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
+ size_t s) {
+ opt->rep.db_write_buffer_size = s;
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+ opt->rep.write_buffer_size = s;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+ opt->rep.max_open_files = n;
+}
+
+void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, int n) {
+ opt->rep.max_file_opening_threads = n;
+}
+
+void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) {
+ opt->rep.max_total_wal_size = n;
+}
+
+void rocksdb_options_set_target_file_size_base(
+ rocksdb_options_t* opt, uint64_t n) {
+ opt->rep.target_file_size_base = n;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.target_file_size_multiplier = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(
+ rocksdb_options_t* opt, uint64_t n) {
+ opt->rep.max_bytes_for_level_base = n;
+}
+
+void rocksdb_options_set_level_compaction_dynamic_level_bytes(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.level_compaction_dynamic_level_bytes = v;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt,
+ double n) {
+ opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.max_compaction_bytes = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+ rocksdb_options_t* opt, int* level_values, size_t num_levels) {
+ opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
+ for (size_t i = 0; i < num_levels; ++i) {
+ opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
+ }
+}
+
+void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
+ opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+}
+
+void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.skip_stats_update_on_db_open = val;
+}
+
+void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt, unsigned char val) {
+ opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+ opt->rep.num_levels = n;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.level0_stop_writes_trigger = n;
+}
+
+void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* /*opt*/,
+ int /*n*/) {}
+
+void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) {
+ opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode);
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+ opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+ int* level_values,
+ size_t num_levels) {
+ opt->rep.compression_per_level.resize(num_levels);
+ for (size_t i = 0; i < num_levels; ++i) {
+ opt->rep.compression_per_level[i] =
+ static_cast<CompressionType>(level_values[i]);
+ }
+}
+
+void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt,
+ int w_bits, int level,
+ int strategy,
+ int max_dict_bytes,
+ bool enabled) {
+ opt->rep.bottommost_compression_opts.window_bits = w_bits;
+ opt->rep.bottommost_compression_opts.level = level;
+ opt->rep.bottommost_compression_opts.strategy = strategy;
+ opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits,
+ int level, int strategy,
+ int max_dict_bytes) {
+ opt->rep.compression_opts.window_bits = w_bits;
+ opt->rep.compression_opts.level = level;
+ opt->rep.compression_opts.strategy = strategy;
+ opt->rep.compression_opts.max_dict_bytes = max_dict_bytes;
+}
+
+void rocksdb_options_set_prefix_extractor(
+ rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) {
+ opt->rep.prefix_extractor.reset(prefix_extractor);
+}
+
+void rocksdb_options_set_use_fsync(
+ rocksdb_options_t* opt, int use_fsync) {
+ opt->rep.use_fsync = use_fsync;
+}
+
+void rocksdb_options_set_db_log_dir(
+ rocksdb_options_t* opt, const char* db_log_dir) {
+ opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_wal_dir(
+ rocksdb_options_t* opt, const char* v) {
+ opt->rep.wal_dir = v;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+ opt->rep.WAL_ttl_seconds = ttl;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(
+ rocksdb_options_t* opt, uint64_t limit) {
+ opt->rep.WAL_size_limit_MB = limit;
+}
+
+void rocksdb_options_set_manifest_preallocation_size(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.manifest_preallocation_size = v;
+}
+
+// no-op: this setter intentionally ignores its arguments
+void rocksdb_options_set_purge_redundant_kvs_while_flush(
+ rocksdb_options_t* /*opt*/, unsigned char /*v*/) {}
+
+void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.use_direct_reads = v;
+}
+
+void rocksdb_options_set_use_direct_io_for_flush_and_compaction(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.use_direct_io_for_flush_and_compaction = v;
+}
+
+void rocksdb_options_set_allow_mmap_reads(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.allow_mmap_reads = v;
+}
+
+void rocksdb_options_set_allow_mmap_writes(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.allow_mmap_writes = v;
+}
+
+void rocksdb_options_set_is_fd_close_on_exec(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.is_fd_close_on_exec = v;
+}
+
+void rocksdb_options_set_skip_log_error_on_recovery(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.skip_log_error_on_recovery = v;
+}
+
+void rocksdb_options_set_stats_dump_period_sec(
+ rocksdb_options_t* opt, unsigned int v) {
+ opt->rep.stats_dump_period_sec = v;
+}
+
+void rocksdb_options_set_advise_random_on_open(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.advise_random_on_open = v;
+}
+
+void rocksdb_options_set_access_hint_on_compaction_start(
+ rocksdb_options_t* opt, int v) {
+  switch (v) {
+ case 0:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::NONE;
+ break;
+ case 1:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::NORMAL;
+ break;
+ case 2:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
+ break;
+ case 3:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::WILLNEED;
+ break;
+ }
+}
+
+void rocksdb_options_set_use_adaptive_mutex(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.use_adaptive_mutex = v;
+}
+
+void rocksdb_options_set_wal_bytes_per_sync(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.wal_bytes_per_sync = v;
+}
+
+void rocksdb_options_set_bytes_per_sync(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.bytes_per_sync = v;
+}
+
+void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
+ uint64_t v) {
+ opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v);
+}
+
+void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_concurrent_memtable_write = v;
+}
+
+void rocksdb_options_set_enable_write_thread_adaptive_yield(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.enable_write_thread_adaptive_yield = v;
+}
+
+void rocksdb_options_set_max_sequential_skip_in_iterations(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.max_sequential_skip_in_iterations = v;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) {
+ opt->rep.max_write_buffer_number = n;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) {
+ opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+void rocksdb_options_set_max_write_buffer_number_to_maintain(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.max_write_buffer_number_to_maintain = n;
+}
+
+void rocksdb_options_set_max_write_buffer_size_to_maintain(
+ rocksdb_options_t* opt, int64_t n) {
+ opt->rep.max_write_buffer_size_to_maintain = n;
+}
+
+void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.enable_pipelined_write = v;
+}
+
+void rocksdb_options_set_unordered_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.unordered_write = v;
+}
+
+void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt,
+ uint32_t n) {
+ opt->rep.max_subcompactions = n;
+}
+
+void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_jobs = n;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_compactions = n;
+}
+
+void rocksdb_options_set_base_background_compactions(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.base_background_compactions = n;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_flushes = n;
+}
+
+void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
+ opt->rep.max_log_file_size = v;
+}
+
+void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) {
+ opt->rep.log_file_time_to_roll = v;
+}
+
+void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
+ opt->rep.keep_log_file_num = v;
+}
+
+void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.recycle_log_file_num = v;
+}
+
+void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) {
+ opt->rep.soft_rate_limit = v;
+}
+
+void rocksdb_options_set_hard_rate_limit(rocksdb_options_t* opt, double v) {
+ opt->rep.hard_rate_limit = v;
+}
+
+void rocksdb_options_set_soft_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.soft_pending_compaction_bytes_limit = v;
+}
+
+void rocksdb_options_set_hard_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.hard_pending_compaction_bytes_limit = v;
+}
+
+void rocksdb_options_set_rate_limit_delay_max_milliseconds(
+ rocksdb_options_t* opt, unsigned int v) {
+ opt->rep.rate_limit_delay_max_milliseconds = v;
+}
+
+void rocksdb_options_set_max_manifest_file_size(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.max_manifest_file_size = v;
+}
+
+void rocksdb_options_set_table_cache_numshardbits(
+ rocksdb_options_t* opt, int v) {
+ opt->rep.table_cache_numshardbits = v;
+}
+
+void rocksdb_options_set_table_cache_remove_scan_count_limit(
+ rocksdb_options_t* /*opt*/, int /*v*/) {
+ // this option is deprecated
+}
+
+void rocksdb_options_set_arena_block_size(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.arena_block_size = v;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) {
+ opt->rep.disable_auto_compactions = disable;
+}
+
+void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, int v) {
+ opt->rep.optimize_filters_for_hits = v;
+}
+
+void rocksdb_options_set_delete_obsolete_files_period_micros(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.delete_obsolete_files_period_micros = v;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+ opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) {
+ opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory);
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
+ rocksdb_options_t* opt, double v) {
+ opt->rep.memtable_prefix_bloom_size_ratio = v;
+}
+
+void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.memtable_huge_page_size = v;
+}
+
+void rocksdb_options_set_hash_skip_list_rep(
+ rocksdb_options_t *opt, size_t bucket_count,
+ int32_t skiplist_height, int32_t skiplist_branching_factor) {
+ ROCKSDB_NAMESPACE::MemTableRepFactory* factory =
+ ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
+ bucket_count, skiplist_height, skiplist_branching_factor);
+ opt->rep.memtable_factory.reset(factory);
+}
+
+void rocksdb_options_set_hash_link_list_rep(
+ rocksdb_options_t *opt, size_t bucket_count) {
+ opt->rep.memtable_factory.reset(
+ ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
+}
+
+void rocksdb_options_set_plain_table_factory(
+ rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key,
+ double hash_table_ratio, size_t index_sparseness) {
+ ROCKSDB_NAMESPACE::PlainTableOptions options;
+ options.user_key_len = user_key_len;
+ options.bloom_bits_per_key = bloom_bits_per_key;
+ options.hash_table_ratio = hash_table_ratio;
+ options.index_sparseness = index_sparseness;
+
+ ROCKSDB_NAMESPACE::TableFactory* factory =
+ ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
+ opt->rep.table_factory.reset(factory);
+}
+
+void rocksdb_options_set_max_successive_merges(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.max_successive_merges = v;
+}
+
+void rocksdb_options_set_bloom_locality(
+ rocksdb_options_t* opt, uint32_t v) {
+ opt->rep.bloom_locality = v;
+}
+
+void rocksdb_options_set_inplace_update_support(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.inplace_update_support = v;
+}
+
+void rocksdb_options_set_inplace_update_num_locks(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.inplace_update_num_locks = v;
+}
+
+void rocksdb_options_set_report_bg_io_stats(
+ rocksdb_options_t* opt, int v) {
+ opt->rep.report_bg_io_stats = v;
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
+ opt->rep.compaction_style =
+ static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style);
+}
+
+void rocksdb_options_set_universal_compaction_options(
+    rocksdb_options_t* opt, rocksdb_universal_compaction_options_t* uco) {
+  opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+void rocksdb_options_set_fifo_compaction_options(
+ rocksdb_options_t* opt,
+ rocksdb_fifo_compaction_options_t* fifo) {
+ opt->rep.compaction_options_fifo = fifo->rep;
+}
+
+char *rocksdb_options_statistics_get_string(rocksdb_options_t *opt) {
+ ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
+ if (statistics) {
+ return strdup(statistics->ToString().c_str());
+ }
+ return nullptr;
+}
+
+void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
+                                     rocksdb_ratelimiter_t* limiter) {
+ if (limiter) {
+ opt->rep.rate_limiter = limiter->rep;
+ }
+}
+
+void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt,
+ unsigned char atomic_flush) {
+ opt->rep.atomic_flush = atomic_flush;
+}
+
+rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(
+ int64_t rate_bytes_per_sec,
+ int64_t refill_period_us,
+ int32_t fairness) {
+ rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t;
+ rate_limiter->rep.reset(
+ NewGenericRateLimiter(rate_bytes_per_sec,
+ refill_period_us, fairness));
+ return rate_limiter;
+}
+
+void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) {
+ delete limiter;
+}
+
+void rocksdb_options_set_row_cache(rocksdb_options_t* opt, rocksdb_cache_t* cache) {
+  if (cache) {
+ opt->rep.row_cache = cache->rep;
+ }
+}
+
+void rocksdb_set_perf_level(int v) {
+ PerfLevel level = static_cast<PerfLevel>(v);
+ SetPerfLevel(level);
+}
+
+rocksdb_perfcontext_t* rocksdb_perfcontext_create() {
+ rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t;
+ context->rep = ROCKSDB_NAMESPACE::get_perf_context();
+ return context;
+}
+
+void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) {
+ context->rep->Reset();
+}
+
+char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context,
+ unsigned char exclude_zero_counters) {
+ return strdup(context->rep->ToString(exclude_zero_counters).c_str());
+}
+
+uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
+ int metric) {
+ PerfContext* rep = context->rep;
+ switch (metric) {
+ case rocksdb_user_key_comparison_count:
+ return rep->user_key_comparison_count;
+ case rocksdb_block_cache_hit_count:
+ return rep->block_cache_hit_count;
+ case rocksdb_block_read_count:
+ return rep->block_read_count;
+ case rocksdb_block_read_byte:
+ return rep->block_read_byte;
+ case rocksdb_block_read_time:
+ return rep->block_read_time;
+ case rocksdb_block_checksum_time:
+ return rep->block_checksum_time;
+ case rocksdb_block_decompress_time:
+ return rep->block_decompress_time;
+ case rocksdb_get_read_bytes:
+ return rep->get_read_bytes;
+ case rocksdb_multiget_read_bytes:
+ return rep->multiget_read_bytes;
+ case rocksdb_iter_read_bytes:
+ return rep->iter_read_bytes;
+ case rocksdb_internal_key_skipped_count:
+ return rep->internal_key_skipped_count;
+ case rocksdb_internal_delete_skipped_count:
+ return rep->internal_delete_skipped_count;
+ case rocksdb_internal_recent_skipped_count:
+ return rep->internal_recent_skipped_count;
+ case rocksdb_internal_merge_count:
+ return rep->internal_merge_count;
+ case rocksdb_get_snapshot_time:
+ return rep->get_snapshot_time;
+ case rocksdb_get_from_memtable_time:
+ return rep->get_from_memtable_time;
+ case rocksdb_get_from_memtable_count:
+ return rep->get_from_memtable_count;
+ case rocksdb_get_post_process_time:
+ return rep->get_post_process_time;
+ case rocksdb_get_from_output_files_time:
+ return rep->get_from_output_files_time;
+ case rocksdb_seek_on_memtable_time:
+ return rep->seek_on_memtable_time;
+ case rocksdb_seek_on_memtable_count:
+ return rep->seek_on_memtable_count;
+ case rocksdb_next_on_memtable_count:
+ return rep->next_on_memtable_count;
+ case rocksdb_prev_on_memtable_count:
+ return rep->prev_on_memtable_count;
+ case rocksdb_seek_child_seek_time:
+ return rep->seek_child_seek_time;
+ case rocksdb_seek_child_seek_count:
+ return rep->seek_child_seek_count;
+ case rocksdb_seek_min_heap_time:
+ return rep->seek_min_heap_time;
+ case rocksdb_seek_max_heap_time:
+ return rep->seek_max_heap_time;
+ case rocksdb_seek_internal_seek_time:
+ return rep->seek_internal_seek_time;
+ case rocksdb_find_next_user_entry_time:
+ return rep->find_next_user_entry_time;
+ case rocksdb_write_wal_time:
+ return rep->write_wal_time;
+ case rocksdb_write_memtable_time:
+ return rep->write_memtable_time;
+ case rocksdb_write_delay_time:
+ return rep->write_delay_time;
+ case rocksdb_write_pre_and_post_process_time:
+ return rep->write_pre_and_post_process_time;
+ case rocksdb_db_mutex_lock_nanos:
+ return rep->db_mutex_lock_nanos;
+ case rocksdb_db_condition_wait_nanos:
+ return rep->db_condition_wait_nanos;
+ case rocksdb_merge_operator_time_nanos:
+ return rep->merge_operator_time_nanos;
+ case rocksdb_read_index_block_nanos:
+ return rep->read_index_block_nanos;
+ case rocksdb_read_filter_block_nanos:
+ return rep->read_filter_block_nanos;
+ case rocksdb_new_table_block_iter_nanos:
+ return rep->new_table_block_iter_nanos;
+ case rocksdb_new_table_iterator_nanos:
+ return rep->new_table_iterator_nanos;
+ case rocksdb_block_seek_nanos:
+ return rep->block_seek_nanos;
+ case rocksdb_find_table_nanos:
+ return rep->find_table_nanos;
+ case rocksdb_bloom_memtable_hit_count:
+ return rep->bloom_memtable_hit_count;
+ case rocksdb_bloom_memtable_miss_count:
+ return rep->bloom_memtable_miss_count;
+ case rocksdb_bloom_sst_hit_count:
+ return rep->bloom_sst_hit_count;
+ case rocksdb_bloom_sst_miss_count:
+ return rep->bloom_sst_miss_count;
+ case rocksdb_key_lock_wait_time:
+ return rep->key_lock_wait_time;
+ case rocksdb_key_lock_wait_count:
+ return rep->key_lock_wait_count;
+ case rocksdb_env_new_sequential_file_nanos:
+ return rep->env_new_sequential_file_nanos;
+ case rocksdb_env_new_random_access_file_nanos:
+ return rep->env_new_random_access_file_nanos;
+ case rocksdb_env_new_writable_file_nanos:
+ return rep->env_new_writable_file_nanos;
+ case rocksdb_env_reuse_writable_file_nanos:
+ return rep->env_reuse_writable_file_nanos;
+ case rocksdb_env_new_random_rw_file_nanos:
+ return rep->env_new_random_rw_file_nanos;
+ case rocksdb_env_new_directory_nanos:
+ return rep->env_new_directory_nanos;
+ case rocksdb_env_file_exists_nanos:
+ return rep->env_file_exists_nanos;
+ case rocksdb_env_get_children_nanos:
+ return rep->env_get_children_nanos;
+ case rocksdb_env_get_children_file_attributes_nanos:
+ return rep->env_get_children_file_attributes_nanos;
+ case rocksdb_env_delete_file_nanos:
+ return rep->env_delete_file_nanos;
+ case rocksdb_env_create_dir_nanos:
+ return rep->env_create_dir_nanos;
+ case rocksdb_env_create_dir_if_missing_nanos:
+ return rep->env_create_dir_if_missing_nanos;
+ case rocksdb_env_delete_dir_nanos:
+ return rep->env_delete_dir_nanos;
+ case rocksdb_env_get_file_size_nanos:
+ return rep->env_get_file_size_nanos;
+ case rocksdb_env_get_file_modification_time_nanos:
+ return rep->env_get_file_modification_time_nanos;
+ case rocksdb_env_rename_file_nanos:
+ return rep->env_rename_file_nanos;
+ case rocksdb_env_link_file_nanos:
+ return rep->env_link_file_nanos;
+ case rocksdb_env_lock_file_nanos:
+ return rep->env_lock_file_nanos;
+ case rocksdb_env_unlock_file_nanos:
+ return rep->env_unlock_file_nanos;
+ case rocksdb_env_new_logger_nanos:
+ return rep->env_new_logger_nanos;
+ default:
+ break;
+ }
+ return 0;
+}
+
+void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) {
+ delete context;
+}
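+
+/* Usage sketch: collecting per-operation statistics with the perf context
+   wrappers above.  The rocksdb_get() call and the rocksdb_enable_time
+   perf-level constant are assumed from the public C header; `db` and
+   `ropts` stand for an opened rocksdb_t* and a rocksdb_readoptions_t*.
+
+     rocksdb_set_perf_level(rocksdb_enable_time);
+     // The context wraps the calling thread's PerfContext.
+     rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
+     rocksdb_perfcontext_reset(ctx);
+
+     size_t vlen = 0;
+     char* err = NULL;
+     char* val = rocksdb_get(db, ropts, "key", 3, &vlen, &err);
+
+     uint64_t reads = rocksdb_perfcontext_metric(ctx, rocksdb_block_read_count);
+     char* report = rocksdb_perfcontext_report(ctx, 1);  // 1 = skip zero counters
+     // Both `val` and `report` are heap-allocated copies and must be freed.
+     rocksdb_free(val);
+     rocksdb_free(report);
+     rocksdb_perfcontext_destroy(ctx);
+*/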
+
+/*
+TODO:
+DB::OpenForReadOnly
+DB::KeyMayExist
+DB::GetOptions
+DB::GetSortedWalFiles
+DB::GetLatestSequenceNumber
+DB::GetUpdatesSince
+DB::GetDbIdentity
+DB::RunManualCompaction
+custom cache
+table_properties_collectors
+*/
+
+rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
+ void* state,
+ void (*destructor)(void*),
+ unsigned char (*filter)(
+ void*,
+ int level,
+ const char* key, size_t key_length,
+ const char* existing_value, size_t value_length,
+ char** new_value, size_t *new_value_length,
+ unsigned char* value_changed),
+ const char* (*name)(void*)) {
+ rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->filter_ = filter;
+ result->ignore_snapshots_ = true;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_compactionfilter_set_ignore_snapshots(
+ rocksdb_compactionfilter_t* filter,
+ unsigned char whether_ignore) {
+ filter->ignore_snapshots_ = whether_ignore;
+}
+
+void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) {
+ delete filter;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
+ rocksdb_compactionfiltercontext_t* context) {
+ return context->rep.is_full_compaction;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
+ rocksdb_compactionfiltercontext_t* context) {
+ return context->rep.is_manual_compaction;
+}
+
+rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create(
+ void* state, void (*destructor)(void*),
+ rocksdb_compactionfilter_t* (*create_compaction_filter)(
+ void*, rocksdb_compactionfiltercontext_t* context),
+ const char* (*name)(void*)) {
+ rocksdb_compactionfilterfactory_t* result =
+ new rocksdb_compactionfilterfactory_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->create_compaction_filter_ = create_compaction_filter;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_compactionfilterfactory_destroy(
+ rocksdb_compactionfilterfactory_t* factory) {
+ delete factory;
+}
+
+rocksdb_comparator_t* rocksdb_comparator_create(
+ void* state,
+ void (*destructor)(void*),
+ int (*compare)(
+ void*,
+ const char* a, size_t alen,
+ const char* b, size_t blen),
+ const char* (*name)(void*)) {
+ rocksdb_comparator_t* result = new rocksdb_comparator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->compare_ = compare;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) {
+ delete cmp;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+ void* state,
+ void (*destructor)(void*),
+ char* (*create_filter)(
+ void*,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length),
+ unsigned char (*key_may_match)(
+ void*,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length),
+ void (*delete_filter)(
+ void*,
+ const char* filter, size_t filter_length),
+ const char* (*name)(void*)) {
+ rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->create_ = create_filter;
+ result->key_match_ = key_may_match;
+ result->delete_filter_ = delete_filter;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
+ delete filter;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(
+    int bits_per_key, bool original_format) {
+ // Make a rocksdb_filterpolicy_t, but override all of its methods so
+ // they delegate to a NewBloomFilterPolicy() instead of user
+ // supplied C functions.
+ struct Wrapper : public rocksdb_filterpolicy_t {
+ const FilterPolicy* rep_;
+ ~Wrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ void CreateFilter(const Slice* keys, int n,
+ std::string* dst) const override {
+ return rep_->CreateFilter(keys, n, dst);
+ }
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ return rep_->KeyMayMatch(key, filter);
+ }
+ // No need to override GetFilterBitsBuilder if this one is overridden
+ ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
+ const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
+ const override {
+ return rep_->GetBuilderWithContext(context);
+ }
+ ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
+ const Slice& contents) const override {
+ return rep_->GetFilterBitsReader(contents);
+ }
+ static void DoNothing(void*) {}
+ };
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format);
+ wrapper->state_ = nullptr;
+ wrapper->delete_filter_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(int bits_per_key) {
+ return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) {
+ return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true);
+}
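+
+/* Usage sketch: plugging a Bloom filter policy into a block-based table.
+   The block-based table helpers (rocksdb_block_based_options_create() and
+   friends) are assumed from elsewhere in this C API, and the setter is
+   expected to take ownership of the policy wrapper, so it is not destroyed
+   here; `opts` stands for an existing rocksdb_options_t*.
+
+     rocksdb_filterpolicy_t* bloom = rocksdb_filterpolicy_create_bloom(10);
+     rocksdb_block_based_table_options_t* bbto =
+         rocksdb_block_based_options_create();
+     rocksdb_block_based_options_set_filter_policy(bbto, bloom);
+     rocksdb_options_set_block_based_table_factory(opts, bbto);
+*/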
+
+rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+ void* state, void (*destructor)(void*),
+ char* (*full_merge)(void*, const char* key, size_t key_length,
+ const char* existing_value,
+ size_t existing_value_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ char* (*partial_merge)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ void (*delete_value)(void*, const char* value, size_t value_length),
+ const char* (*name)(void*)) {
+ rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->full_merge_ = full_merge;
+ result->partial_merge_ = partial_merge;
+ result->delete_value_ = delete_value;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) {
+ delete merge_operator;
+}
+
+rocksdb_readoptions_t* rocksdb_readoptions_create() {
+ return new rocksdb_readoptions_t;
+}
+
+void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_readoptions_set_verify_checksums(
+ rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.verify_checksums = v;
+}
+
+void rocksdb_readoptions_set_fill_cache(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.fill_cache = v;
+}
+
+void rocksdb_readoptions_set_snapshot(
+ rocksdb_readoptions_t* opt,
+ const rocksdb_snapshot_t* snap) {
+ opt->rep.snapshot = (snap ? snap->rep : nullptr);
+}
+
+void rocksdb_readoptions_set_iterate_upper_bound(
+ rocksdb_readoptions_t* opt,
+ const char* key, size_t keylen) {
+ if (key == nullptr) {
+ opt->upper_bound = Slice();
+    opt->rep.iterate_upper_bound = nullptr;
+  } else {
+ opt->upper_bound = Slice(key, keylen);
+ opt->rep.iterate_upper_bound = &opt->upper_bound;
+ }
+}
+
+void rocksdb_readoptions_set_iterate_lower_bound(
+ rocksdb_readoptions_t *opt,
+ const char* key, size_t keylen) {
+ if (key == nullptr) {
+ opt->lower_bound = Slice();
+ opt->rep.iterate_lower_bound = nullptr;
+ } else {
+ opt->lower_bound = Slice(key, keylen);
+ opt->rep.iterate_lower_bound = &opt->lower_bound;
+ }
+}
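+
+/* Usage sketch: bounding an iterator scan with the setters above.  The
+   iterator calls (rocksdb_create_iterator(), rocksdb_iter_*) are assumed
+   from elsewhere in this C API, and `db` stands for an opened rocksdb_t*.
+   Note that the bound Slices reference the caller's buffer, so that buffer
+   must stay alive for as long as the read options are in use.
+
+     rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+     const char upper[] = "user:z";
+     rocksdb_readoptions_set_iterate_upper_bound(ropts, upper,
+                                                 sizeof(upper) - 1);
+     rocksdb_iterator_t* it = rocksdb_create_iterator(db, ropts);
+     for (rocksdb_iter_seek(it, "user:", 5); rocksdb_iter_valid(it);
+          rocksdb_iter_next(it)) {
+       size_t klen;
+       const char* k = rocksdb_iter_key(it, &klen);
+       // ... consume k[0..klen) ...
+     }
+     rocksdb_iter_destroy(it);
+     rocksdb_readoptions_destroy(ropts);
+*/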
+
+void rocksdb_readoptions_set_read_tier(
+ rocksdb_readoptions_t* opt, int v) {
+ opt->rep.read_tier = static_cast<ROCKSDB_NAMESPACE::ReadTier>(v);
+}
+
+void rocksdb_readoptions_set_tailing(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.tailing = v;
+}
+
+void rocksdb_readoptions_set_managed(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.managed = v;
+}
+
+void rocksdb_readoptions_set_readahead_size(
+ rocksdb_readoptions_t* opt, size_t v) {
+ opt->rep.readahead_size = v;
+}
+
+void rocksdb_readoptions_set_prefix_same_as_start(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.prefix_same_as_start = v;
+}
+
+void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.pin_data = v;
+}
+
+void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.total_order_seek = v;
+}
+
+void rocksdb_readoptions_set_max_skippable_internal_keys(
+ rocksdb_readoptions_t* opt,
+ uint64_t v) {
+ opt->rep.max_skippable_internal_keys = v;
+}
+
+void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.background_purge_on_iterator_cleanup = v;
+}
+
+void rocksdb_readoptions_set_ignore_range_deletions(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.ignore_range_deletions = v;
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+ return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_writeoptions_set_sync(
+ rocksdb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.sync = v;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) {
+ opt->rep.disableWAL = disable;
+}
+
+void rocksdb_writeoptions_set_ignore_missing_column_families(
+ rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.ignore_missing_column_families = v;
+}
+
+void rocksdb_writeoptions_set_no_slowdown(
+ rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.no_slowdown = v;
+}
+
+void rocksdb_writeoptions_set_low_pri(
+ rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.low_pri = v;
+}
+
+void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(
+ rocksdb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.memtable_insert_hint_per_batch = v;
+}
+
+rocksdb_compactoptions_t* rocksdb_compactoptions_create() {
+ return new rocksdb_compactoptions_t;
+}
+
+void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_compactoptions_set_bottommost_level_compaction(
+ rocksdb_compactoptions_t* opt, unsigned char v) {
+ opt->rep.bottommost_level_compaction = static_cast<BottommostLevelCompaction>(v);
+}
+
+void rocksdb_compactoptions_set_exclusive_manual_compaction(
+ rocksdb_compactoptions_t* opt, unsigned char v) {
+ opt->rep.exclusive_manual_compaction = v;
+}
+
+void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt,
+ unsigned char v) {
+ opt->rep.change_level = v;
+}
+
+void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt,
+ int n) {
+ opt->rep.target_level = n;
+}
+
+rocksdb_flushoptions_t* rocksdb_flushoptions_create() {
+ return new rocksdb_flushoptions_t;
+}
+
+void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_flushoptions_set_wait(
+ rocksdb_flushoptions_t* opt, unsigned char v) {
+ opt->rep.wait = v;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+ rocksdb_cache_t* c = new rocksdb_cache_t;
+ c->rep = NewLRUCache(capacity);
+ return c;
+}
+
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
+ delete cache;
+}
+
+void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
+ cache->rep->SetCapacity(capacity);
+}
+
+size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
+ return cache->rep->GetUsage();
+}
+
+size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
+ return cache->rep->GetPinnedUsage();
+}
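+
+/* Usage sketch: sharing an LRU cache as the row cache.  Because the cache
+   handle wraps a shared_ptr, the options keep the cache alive even after
+   the wrapper is destroyed; `opts` stands for an existing rocksdb_options_t*.
+
+     rocksdb_cache_t* cache = rocksdb_cache_create_lru(64 << 20);  // 64 MiB
+     rocksdb_options_set_row_cache(opts, cache);
+     size_t used = rocksdb_cache_get_usage(cache);  // current memory usage
+     rocksdb_cache_destroy(cache);                  // drops only this handle
+*/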
+
+rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size) {
+ rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
+ result->rep.path = std::string(path);
+ result->rep.target_size = target_size;
+ return result;
+}
+
+void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) {
+ delete dbpath;
+}
+
+rocksdb_env_t* rocksdb_create_default_env() {
+ rocksdb_env_t* result = new rocksdb_env_t;
+ result->rep = Env::Default();
+ result->is_default = true;
+ return result;
+}
+
+rocksdb_env_t* rocksdb_create_mem_env() {
+ rocksdb_env_t* result = new rocksdb_env_t;
+ result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
+ result->is_default = false;
+ return result;
+}
+
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+ env->rep->SetBackgroundThreads(n);
+}
+
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) {
+ env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+void rocksdb_env_join_all_threads(rocksdb_env_t* env) {
+ env->rep->WaitForJoin();
+}
+
+void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolIOPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolIOPriority(Env::HIGH);
+}
+
+void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolCPUPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolCPUPriority(Env::HIGH);
+}
+
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+ if (!env->is_default) delete env->rep;
+ delete env;
+}
+
+rocksdb_envoptions_t* rocksdb_envoptions_create() {
+ rocksdb_envoptions_t* opt = new rocksdb_envoptions_t;
+ return opt;
+}
+
+void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; }
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) {
+ rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+ writer->rep = new SstFileWriter(env->rep, io_options->rep);
+ return writer;
+}
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
+ const rocksdb_comparator_t* /*comparator*/) {
+ rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+ writer->rep = new SstFileWriter(env->rep, io_options->rep);
+ return writer;
+}
+
+void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer,
+ const char* name, char** errptr) {
+ SaveError(errptr, writer->rep->Open(std::string(name)));
+}
+
+void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key,
+ size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key,
+ size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Delete(Slice(key, keylen)));
+}
+
+void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Finish(nullptr));
+}
+
+void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer,
+ uint64_t* file_size) {
+ *file_size = writer->rep->FileSize();
+}
+
+void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) {
+ delete writer->rep;
+ delete writer;
+}
+
+rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create() {
+ rocksdb_ingestexternalfileoptions_t* opt =
+ new rocksdb_ingestexternalfileoptions_t;
+ return opt;
+}
+
+void rocksdb_ingestexternalfileoptions_set_move_files(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) {
+ opt->rep.move_files = move_files;
+}
+
+void rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char snapshot_consistency) {
+ opt->rep.snapshot_consistency = snapshot_consistency;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_global_seqno) {
+ opt->rep.allow_global_seqno = allow_global_seqno;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_blocking_flush) {
+ opt->rep.allow_blocking_flush = allow_blocking_flush;
+}
+
+void rocksdb_ingestexternalfileoptions_set_ingest_behind(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char ingest_behind) {
+ opt->rep.ingest_behind = ingest_behind;
+}
+
+void rocksdb_ingestexternalfileoptions_destroy(
+ rocksdb_ingestexternalfileoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_ingest_external_file(
+ rocksdb_t* db, const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+ std::vector<std::string> files(list_len);
+ for (size_t i = 0; i < list_len; ++i) {
+ files[i] = std::string(file_list[i]);
+ }
+ SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep));
+}
+
+void rocksdb_ingest_external_file_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+ const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+ std::vector<std::string> files(list_len);
+ for (size_t i = 0; i < list_len; ++i) {
+ files[i] = std::string(file_list[i]);
+ }
+ SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep));
+}
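+
+/* Usage sketch: building an SST file with the writer above and ingesting it
+   into an open database.  `db` and `opts` stand for an existing rocksdb_t*
+   and rocksdb_options_t*, "/tmp/bulk.sst" is an arbitrary example path,
+   error checks are omitted, and keys must be added in ascending comparator
+   order.
+
+     char* err = NULL;
+     rocksdb_envoptions_t* env_opts = rocksdb_envoptions_create();
+     rocksdb_sstfilewriter_t* writer =
+         rocksdb_sstfilewriter_create(env_opts, opts);
+     rocksdb_sstfilewriter_open(writer, "/tmp/bulk.sst", &err);
+     rocksdb_sstfilewriter_put(writer, "a", 1, "1", 1, &err);
+     rocksdb_sstfilewriter_put(writer, "b", 1, "2", 1, &err);
+     rocksdb_sstfilewriter_finish(writer, &err);
+     rocksdb_sstfilewriter_destroy(writer);
+     rocksdb_envoptions_destroy(env_opts);
+
+     const char* const files[1] = {"/tmp/bulk.sst"};
+     rocksdb_ingestexternalfileoptions_t* ifo =
+         rocksdb_ingestexternalfileoptions_create();
+     rocksdb_ingest_external_file(db, files, 1, ifo, &err);
+     rocksdb_ingestexternalfileoptions_destroy(ifo);
+*/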
+
+void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) {
+ SaveError(errptr, db->rep->TryCatchUpWithPrimary());
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+ void* state,
+ void (*destructor)(void*),
+ char* (*transform)(
+ void*,
+ const char* key, size_t length,
+ size_t* dst_length),
+ unsigned char (*in_domain)(
+ void*,
+ const char* key, size_t length),
+ unsigned char (*in_range)(
+ void*,
+ const char* key, size_t length),
+ const char* (*name)(void*)) {
+ rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->transform_ = transform;
+ result->in_domain_ = in_domain;
+ result->in_range_ = in_range;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) {
+ delete st;
+}
+
+struct Wrapper : public rocksdb_slicetransform_t {
+ const SliceTransform* rep_;
+ ~Wrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ Slice Transform(const Slice& src) const override {
+ return rep_->Transform(src);
+ }
+ bool InDomain(const Slice& src) const override {
+ return rep_->InDomain(src);
+ }
+ bool InRange(const Slice& src) const override { return rep_->InRange(src); }
+ static void DoNothing(void*) { }
+};
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) {
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() {
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform();
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
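+
+/* Usage sketch: prefix seeks with a fixed-prefix extractor.  `opts` stands
+   for an existing rocksdb_options_t*; note that
+   rocksdb_options_set_prefix_extractor() takes ownership of the transform,
+   so it must not be destroyed separately afterwards.
+
+     rocksdb_options_set_prefix_extractor(
+         opts, rocksdb_slicetransform_create_fixed_prefix(8));
+     rocksdb_options_set_memtable_prefix_bloom_size_ratio(opts, 0.1);
+
+     rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+     rocksdb_readoptions_set_prefix_same_as_start(ropts, 1);
+*/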
+
+rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create() {
+  rocksdb_universal_compaction_options_t* result =
+      new rocksdb_universal_compaction_options_t;
+  result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal;
+  return result;
+}
+
+void rocksdb_universal_compaction_options_set_size_ratio(
+ rocksdb_universal_compaction_options_t* uco, int ratio) {
+ uco->rep->size_ratio = ratio;
+}
+
+void rocksdb_universal_compaction_options_set_min_merge_width(
+ rocksdb_universal_compaction_options_t* uco, int w) {
+ uco->rep->min_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_merge_width(
+ rocksdb_universal_compaction_options_t* uco, int w) {
+ uco->rep->max_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t* uco, int p) {
+ uco->rep->max_size_amplification_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+ rocksdb_universal_compaction_options_t* uco, int p) {
+ uco->rep->compression_size_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_stop_style(
+ rocksdb_universal_compaction_options_t* uco, int style) {
+ uco->rep->stop_style =
+ static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+ rocksdb_universal_compaction_options_t* uco) {
+ delete uco->rep;
+ delete uco;
+}
+
+rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
+ rocksdb_fifo_compaction_options_t* result = new rocksdb_fifo_compaction_options_t;
+ result->rep = CompactionOptionsFIFO();
+ return result;
+}
+
+void rocksdb_fifo_compaction_options_set_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+ fifo_opts->rep.max_table_files_size = size;
+}
+
+void rocksdb_fifo_compaction_options_destroy(
+ rocksdb_fifo_compaction_options_t* fifo_opts) {
+ delete fifo_opts;
+}
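+
+/* Usage sketch: enabling FIFO compaction through the wrappers above.  The
+   rocksdb_fifo_compaction compaction-style constant is assumed from the
+   public C header, and `opts` stands for an existing rocksdb_options_t*.
+
+     rocksdb_fifo_compaction_options_t* fifo =
+         rocksdb_fifo_compaction_options_create();
+     rocksdb_fifo_compaction_options_set_max_table_files_size(
+         fifo, 1024ull * 1024 * 1024);  // cap total table size at ~1 GiB
+     rocksdb_options_set_compaction_style(opts, rocksdb_fifo_compaction);
+     // The FIFO options are copied by value, so the wrapper can be freed.
+     rocksdb_options_set_fifo_compaction_options(opts, fifo);
+     rocksdb_fifo_compaction_options_destroy(fifo);
+*/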
+
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) {
+ if (level >= 0) {
+ assert(level <= opt->rep.num_levels);
+ opt->rep.compression_per_level.resize(opt->rep.num_levels);
+ for (int i = 0; i < level; i++) {
+ opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression;
+ }
+ for (int i = level; i < opt->rep.num_levels; i++) {
+ opt->rep.compression_per_level[i] = opt->rep.compression;
+ }
+ }
+}
+
+int rocksdb_livefiles_count(
+ const rocksdb_livefiles_t* lf) {
+ return static_cast<int>(lf->rep.size());
+}
+
+const char* rocksdb_livefiles_name(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].name.c_str();
+}
+
+int rocksdb_livefiles_level(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].level;
+}
+
+size_t rocksdb_livefiles_size(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].size;
+}
+
+const char* rocksdb_livefiles_smallestkey(
+ const rocksdb_livefiles_t* lf,
+ int index,
+ size_t* size) {
+ *size = lf->rep[index].smallestkey.size();
+ return lf->rep[index].smallestkey.data();
+}
+
+const char* rocksdb_livefiles_largestkey(
+ const rocksdb_livefiles_t* lf,
+ int index,
+ size_t* size) {
+ *size = lf->rep[index].largestkey.size();
+ return lf->rep[index].largestkey.data();
+}
+
+uint64_t rocksdb_livefiles_entries(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].num_entries;
+}
+
+uint64_t rocksdb_livefiles_deletions(
+ const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].num_deletions;
+}
+
+extern void rocksdb_livefiles_destroy(
+ const rocksdb_livefiles_t* lf) {
+ delete lf;
+}
+
+void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
+ const char* opts_str,
+ rocksdb_options_t* new_options,
+ char** errptr) {
+ SaveError(errptr,
+ GetOptionsFromString(base_options->rep, std::string(opts_str),
+ &new_options->rep));
+}
+
+void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ SaveError(
+ errptr,
+ DeleteFilesInRange(
+ db->rep, db->rep->DefaultColumnFamily(),
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+void rocksdb_delete_file_in_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ SaveError(
+ errptr,
+ DeleteFilesInRange(
+ db->rep, column_family->rep,
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() {
+ return new rocksdb_transactiondb_options_t;
+}
+
+void rocksdb_transactiondb_options_destroy(rocksdb_transactiondb_options_t* opt){
+ delete opt;
+}
+
+void rocksdb_transactiondb_options_set_max_num_locks(
+ rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) {
+ opt->rep.max_num_locks = max_num_locks;
+}
+
+void rocksdb_transactiondb_options_set_num_stripes(
+ rocksdb_transactiondb_options_t* opt, size_t num_stripes) {
+ opt->rep.num_stripes = num_stripes;
+}
+
+void rocksdb_transactiondb_options_set_transaction_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) {
+ opt->rep.transaction_lock_timeout = txn_lock_timeout;
+}
+
+void rocksdb_transactiondb_options_set_default_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) {
+ opt->rep.default_lock_timeout = default_lock_timeout;
+}
+
+rocksdb_transaction_options_t* rocksdb_transaction_options_create() {
+ return new rocksdb_transaction_options_t;
+}
+
+void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_transaction_options_set_set_snapshot(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.set_snapshot = v;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.deadlock_detect = v;
+}
+
+void rocksdb_transaction_options_set_lock_timeout(
+ rocksdb_transaction_options_t* opt, int64_t lock_timeout) {
+ opt->rep.lock_timeout = lock_timeout;
+}
+
+void rocksdb_transaction_options_set_expiration(
+ rocksdb_transaction_options_t* opt, int64_t expiration) {
+ opt->rep.expiration = expiration;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect_depth(
+ rocksdb_transaction_options_t* opt, int64_t depth) {
+ opt->rep.deadlock_detect_depth = depth;
+}
+
+void rocksdb_transaction_options_set_max_write_batch_size(
+ rocksdb_transaction_options_t* opt, size_t size) {
+ opt->rep.max_write_batch_size = size;
+}
+
+rocksdb_optimistictransaction_options_t*
+rocksdb_optimistictransaction_options_create() {
+ return new rocksdb_optimistictransaction_options_t;
+}
+
+void rocksdb_optimistictransaction_options_destroy(
+ rocksdb_optimistictransaction_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_optimistictransaction_options_set_set_snapshot(
+ rocksdb_optimistictransaction_options_t* opt, unsigned char v) {
+ opt->rep.set_snapshot = v;
+}
+
+rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr) {
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr, txn_db->rep->CreateColumnFamily(
+ ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep)));
+ return handle;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ char** errptr) {
+ TransactionDB* txn_db;
+ if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+ std::string(name), &txn_db))) {
+ return nullptr;
+ }
+ rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+ result->rep = txn_db;
+ return result;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ TransactionDB* txn_db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+ std::string(name), column_families,
+ &handles, &txn_db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+ result->rep = txn_db;
+ return result;
+}
+
+const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot(
+ rocksdb_transactiondb_t* txn_db) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = txn_db->rep->GetSnapshot();
+ return result;
+}
+
+void rocksdb_transactiondb_release_snapshot(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) {
+ txn_db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+rocksdb_transaction_t* rocksdb_transaction_begin(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_transaction_options_t* txn_options,
+ rocksdb_transaction_t* old_txn) {
+ if (old_txn == nullptr) {
+ rocksdb_transaction_t* result = new rocksdb_transaction_t;
+ result->rep = txn_db->rep->BeginTransaction(write_options->rep,
+ txn_options->rep, nullptr);
+ return result;
+ }
+ old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep,
+ txn_options->rep, old_txn->rep);
+ return old_txn;
+}
+
+void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Commit());
+}
+
+void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Rollback());
+}
+
+void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) {
+ txn->rep->SetSavePoint();
+}
+
+void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->RollbackToSavePoint());
+}
+
+void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) {
+ delete txn->rep;
+ delete txn;
+}
+
+const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot(
+ rocksdb_transaction_t* txn) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = txn->rep->GetSnapshot();
+ return result;
+}
+
+// Read a key inside a transaction
+char* rocksdb_transaction_get(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+// Read a key inside a transaction and mark it for update (GetForUpdate)
+char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_transaction_get_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
+ Slice(key, klen), &tmp, exclusive);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+// Read a key outside a transaction
+char* rocksdb_transactiondb_get(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen,
+ size_t* vlen,
+ char** errptr){
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_transactiondb_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn_db->rep->Get(options->rep, column_family->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+// Put a key inside a transaction
+void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, const char* val, size_t vlen,
+ char** errptr) {
+ SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+// Put a key outside a transaction
+void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr,
+ txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+// Write batch into transaction db
+void rocksdb_transactiondb_write(
+ rocksdb_transactiondb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch,
+ char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+// Merge a key inside a transaction
+void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, const char* val, size_t vlen,
+ char** errptr) {
+ SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+// Merge a key outside a transaction
+void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_merge_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ const char* val, size_t vlen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep,
+ Slice(key, klen), Slice(val, vlen)));
+}
+
+// Delete a key inside a transaction
+void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, char** errptr) {
+ SaveError(errptr, txn->rep->Delete(Slice(key, klen)));
+}
+
+void rocksdb_transaction_delete_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, char** errptr) {
+ SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen)));
+}
+
+// Delete a key outside a transaction
+void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen)));
+}
+
+void rocksdb_transactiondb_delete_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+// Create an iterator inside a transaction
+rocksdb_iterator_t* rocksdb_transaction_create_iterator(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn->rep->GetIterator(options->rep);
+ return result;
+}
+
+// Create an iterator inside a transaction with column family
+rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn->rep->GetIterator(options->rep, column_family->rep);
+ return result;
+}
+
+// Create an iterator outside a transaction
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn_db->rep->NewIterator(options->rep);
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep);
+ return result;
+}
+
+void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) {
+ delete txn_db->rep;
+ delete txn_db;
+}
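+
+/* Usage sketch: a minimal pessimistic transaction round trip using the
+   wrappers above.  Error handling is omitted, "/tmp/txndb" is an arbitrary
+   example path, and rocksdb_options_create(), rocksdb_options_destroy() and
+   rocksdb_options_set_create_if_missing() are assumed from elsewhere in
+   this C API.
+
+     char* err = NULL;
+     rocksdb_options_t* opts = rocksdb_options_create();
+     rocksdb_options_set_create_if_missing(opts, 1);
+     rocksdb_transactiondb_options_t* txn_db_opts =
+         rocksdb_transactiondb_options_create();
+     rocksdb_transactiondb_t* txn_db =
+         rocksdb_transactiondb_open(opts, txn_db_opts, "/tmp/txndb", &err);
+
+     rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
+     rocksdb_transaction_options_t* txn_opts =
+         rocksdb_transaction_options_create();
+     rocksdb_transaction_t* txn =
+         rocksdb_transaction_begin(txn_db, wopts, txn_opts, NULL);
+     rocksdb_transaction_put(txn, "key", 3, "value", 5, &err);
+     rocksdb_transaction_commit(txn, &err);
+     rocksdb_transaction_destroy(txn);
+
+     rocksdb_transaction_options_destroy(txn_opts);
+     rocksdb_writeoptions_destroy(wopts);
+     rocksdb_transactiondb_options_destroy(txn_db_opts);
+     rocksdb_options_destroy(opts);
+     rocksdb_transactiondb_close(txn_db);
+*/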
+
+rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
+ rocksdb_transactiondb_t* txn_db, char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open(
+ const rocksdb_options_t* options, const char* name, char** errptr) {
+ OptimisticTransactionDB* otxn_db;
+ if (SaveError(errptr, OptimisticTransactionDB::Open(
+ options->rep, std::string(name), &otxn_db))) {
+ return nullptr;
+ }
+ rocksdb_optimistictransactiondb_t* result =
+ new rocksdb_optimistictransactiondb_t;
+ result->rep = otxn_db;
+ return result;
+}
+
+rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ OptimisticTransactionDB* otxn_db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, OptimisticTransactionDB::Open(
+ DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &otxn_db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_optimistictransactiondb_t* result =
+ new rocksdb_optimistictransactiondb_t;
+ result->rep = otxn_db;
+ return result;
+}
+
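+// Get a wrapper around the base DB of an OptimisticTransactionDB (NULL if
+// none); release the wrapper with rocksdb_optimistictransactiondb_close_base_db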
+rocksdb_t* rocksdb_optimistictransactiondb_get_base_db(
+ rocksdb_optimistictransactiondb_t* otxn_db) {
+ DB* base_db = otxn_db->rep->GetBaseDB();
+
+ if (base_db != nullptr) {
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = base_db;
+ return result;
+ }
+
+ return nullptr;
+}
+
+void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) {
+ delete base_db;
+}
+
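+// Begin an optimistic transaction; pass old_txn to reuse an existing
+// transaction handle instead of allocating a new one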
+rocksdb_transaction_t* rocksdb_optimistictransaction_begin(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_optimistictransaction_options_t* otxn_options,
+ rocksdb_transaction_t* old_txn) {
+ if (old_txn == nullptr) {
+ rocksdb_transaction_t* result = new rocksdb_transaction_t;
+ result->rep = otxn_db->rep->BeginTransaction(write_options->rep,
+ otxn_options->rep, nullptr);
+ return result;
+ }
+ old_txn->rep = otxn_db->rep->BeginTransaction(
+ write_options->rep, otxn_options->rep, old_txn->rep);
+ return old_txn;
+}
+
+void rocksdb_optimistictransactiondb_close(
+ rocksdb_optimistictransactiondb_t* otxn_db) {
+ delete otxn_db->rep;
+ delete otxn_db;
+}
+
+void rocksdb_free(void* ptr) { free(ptr); }
+
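+// Get a value as a pinnable slice; returns NULL (with no error set) when the
+// key is not found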
+rocksdb_pinnableslice_t* rocksdb_get_pinned(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+ Slice(key, keylen), &v->rep);
+ if (!s.ok()) {
+ delete v;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+ &v->rep);
+ if (!s.ok()) {
+ delete v;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; }
+
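+// Return the value bytes of a pinnable slice and store its length in *vlen;
+// returns NULL and sets *vlen to 0 for a NULL slice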
+const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v,
+ size_t* vlen) {
+ if (!v) {
+ *vlen = 0;
+ return nullptr;
+ }
+
+ *vlen = v->rep.size();
+ return v->rep.data();
+}
+
+// container to keep databases and caches in order to use
+// ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_consumers_t {
+ std::vector<rocksdb_t*> dbs;
+ std::unordered_set<rocksdb_cache_t*> caches;
+};
+
+// initializes new container of memory consumers
+rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() {
+ return new rocksdb_memory_consumers_t;
+}
+
+// adds database to the container of memory consumers
+void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers,
+ rocksdb_t* db) {
+ consumers->dbs.push_back(db);
+}
+
+// adds cache to the container of memory consumers
+void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers,
+ rocksdb_cache_t* cache) {
+ consumers->caches.insert(cache);
+}
+
+// deletes container with memory consumers
+void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) {
+ delete consumers;
+}
+
+// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_usage_t {
+ uint64_t mem_table_total;
+ uint64_t mem_table_unflushed;
+ uint64_t mem_table_readers_total;
+ uint64_t cache_total;
+};
+
+// estimates amount of memory occupied by consumers (dbs and caches)
+rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
+ rocksdb_memory_consumers_t* consumers, char** errptr) {
+
+ vector<DB*> dbs;
+ for (auto db : consumers->dbs) {
+ dbs.push_back(db->rep);
+ }
+
+ unordered_set<const Cache*> cache_set;
+ for (auto cache : consumers->caches) {
+ cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
+ }
+
+ std::map<ROCKSDB_NAMESPACE::MemoryUtil::UsageType, uint64_t> usage_by_type;
+
+ auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+ &usage_by_type);
+ if (SaveError(errptr, status)) {
+ return nullptr;
+ }
+
+ auto result = new rocksdb_memory_usage_t;
+ result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal];
+ result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed];
+ result->mem_table_readers_total = usage_by_type[MemoryUtil::kTableReadersTotal];
+ result->cache_total = usage_by_type[MemoryUtil::kCacheTotal];
+ return result;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_unflushed;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_readers_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_cache_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->cache_total;
+}
+
+// deletes container with memory usage estimates
+void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
+ delete usage;
+}
+
+} // end extern "C"
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c
new file mode 100644
index 000000000..cf2e266f9
--- /dev/null
+++ b/src/rocksdb/db/c_test.c
@@ -0,0 +1,1866 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors. */
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE // Lite does not support C API
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <inttypes.h>
+
+// Cannot use port/port.h macros as this is a C file
+#ifdef OS_WIN
+#include <windows.h>
+
+// A process/thread-derived ID is OK for the uniqueness needed by test paths
+int geteuid() {
+ int result = 0;
+
+ result = ((int)GetCurrentProcessId() << 16);
+ result |= (int)GetCurrentThreadId();
+
+ return result;
+}
+
+// VS < 2015
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#define snprintf _snprintf
+#endif
+
+#endif
+
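+// Current test phase and scratch paths, filled in from GetTempDir() in main()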
+const char* phase = "";
+static char dbname[200];
+static char sstfilename[200];
+static char dbbackupname[200];
+static char dbcheckpointname[200];
+static char dbpathname[200];
+static char secondary_path[200];
+
+static void StartPhase(const char* name) {
+ fprintf(stderr, "=== Test %s\n", name);
+ phase = name;
+}
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning (disable: 4996) // getenv security warning
+#endif
+static const char* GetTempDir(void) {
+ const char* ret = getenv("TEST_TMPDIR");
+ if (ret == NULL || ret[0] == '\0')
+ ret = "/tmp";
+ return ret;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#define CheckNoError(err) \
+ if ((err) != NULL) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+ abort(); \
+ }
+
+#define CheckCondition(cond) \
+ if (!(cond)) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+ abort(); \
+ }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+ if (expected == NULL && v == NULL) {
+ // ok
+ } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+ memcmp(expected, v, n) == 0) {
+ // ok
+ return;
+ } else {
+ fprintf(stderr, "%s: expected '%s', got '%s'\n",
+ phase,
+ (expected ? expected : "(null)"),
+ (v ? v : "(null)"));
+ abort();
+ }
+}
+
+static void Free(char** ptr) {
+ if (*ptr) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+}
+
+static void CheckValue(
+ char* err,
+ const char* expected,
+ char** actual,
+ size_t actual_length) {
+ CheckNoError(err);
+ CheckEqual(expected, *actual, actual_length);
+ Free(actual);
+}
+
+static void CheckGet(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckGetCF(
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* handle,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_get_cf(db, options, handle, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckPinGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ const char* val;
+ rocksdb_pinnableslice_t* p;
+ p = rocksdb_get_pinned(db, options, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckPinGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* handle,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ const char* val;
+ rocksdb_pinnableslice_t* p;
+ p = rocksdb_get_pinned_cf(db, options, handle, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
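+// Check that the iterator is positioned at the expected key/value pair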
+static void CheckIter(rocksdb_iterator_t* iter,
+ const char* key, const char* val) {
+ size_t len;
+ const char* str;
+ str = rocksdb_iter_key(iter, &len);
+ CheckEqual(key, str, len);
+ str = rocksdb_iter_value(iter, &len);
+ CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckPut(void* ptr,
+ const char* k, size_t klen,
+ const char* v, size_t vlen) {
+ int* state = (int*) ptr;
+ CheckCondition(*state < 2);
+ switch (*state) {
+ case 0:
+ CheckEqual("bar", k, klen);
+ CheckEqual("b", v, vlen);
+ break;
+ case 1:
+ CheckEqual("box", k, klen);
+ CheckEqual("c", v, vlen);
+ break;
+ }
+ (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+ int* state = (int*) ptr;
+ CheckCondition(*state == 2);
+ CheckEqual("bar", k, klen);
+ (*state)++;
+}
+
+static void CmpDestroy(void* arg) { (void)arg; }
+
+static int CmpCompare(void* arg, const char* a, size_t alen,
+ const char* b, size_t blen) {
+ (void)arg;
+ size_t n = (alen < blen) ? alen : blen;
+ int r = memcmp(a, b, n);
+ if (r == 0) {
+ if (alen < blen) r = -1;
+ else if (alen > blen) r = +1;
+ }
+ return r;
+}
+
+static const char* CmpName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+
+// Custom filter policy
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { (void)arg; }
+static const char* FilterName(void* arg) {
+ (void)arg;
+ return "TestFilter";
+}
+static char* FilterCreate(
+ void* arg,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length) {
+ (void)arg;
+ (void)key_array;
+ (void)key_length_array;
+ (void)num_keys;
+ *filter_length = 4;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+static unsigned char FilterKeyMatch(
+ void* arg,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length) {
+ (void)arg;
+ (void)key;
+ (void)length;
+ CheckCondition(filter_length == 4);
+ CheckCondition(memcmp(filter, "fake", 4) == 0);
+ return fake_filter_result;
+}
+
+// Custom compaction filter
+static void CFilterDestroy(void* arg) { (void)arg; }
+static const char* CFilterName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+static unsigned char CFilterFilter(void* arg, int level, const char* key,
+ size_t key_length,
+ const char* existing_value,
+ size_t value_length, char** new_value,
+ size_t* new_value_length,
+ unsigned char* value_changed) {
+ (void)arg;
+ (void)level;
+ (void)existing_value;
+ (void)value_length;
+ if (key_length == 3) {
+ if (memcmp(key, "bar", key_length) == 0) {
+ return 1;
+ } else if (memcmp(key, "baz", key_length) == 0) {
+ *value_changed = 1;
+ *new_value = "newbazvalue";
+ *new_value_length = 11;
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static void CFilterFactoryDestroy(void* arg) { (void)arg; }
+static const char* CFilterFactoryName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+static rocksdb_compactionfilter_t* CFilterCreate(
+ void* arg, rocksdb_compactionfiltercontext_t* context) {
+ (void)arg;
+ (void)context;
+ return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter,
+ CFilterName);
+}
+
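+// Open the database with the given options, write foo/bar/baz, force a
+// compaction, and verify the compaction filter dropped "bar" and rewrote "baz"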
+static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options,
+ rocksdb_readoptions_t* roptions,
+ rocksdb_writeoptions_t* woptions) {
+ char* err = NULL;
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "foovalue");
+ rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", "barvalue");
+ rocksdb_put(db, woptions, "baz", 3, "bazvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "baz", "bazvalue");
+
+ // Force compaction
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ // should have filtered bar, but not foo
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "baz", "newbazvalue");
+ return db;
+}
+
+// Custom merge operator
+static void MergeOperatorDestroy(void* arg) { (void)arg; }
+static const char* MergeOperatorName(void* arg) {
+ (void)arg;
+ return "TestMergeOperator";
+}
+static char* MergeOperatorFullMerge(
+ void* arg,
+ const char* key, size_t key_length,
+ const char* existing_value, size_t existing_value_length,
+ const char* const* operands_list, const size_t* operands_list_length,
+ int num_operands,
+ unsigned char* success, size_t* new_value_length) {
+ (void)arg;
+ (void)key;
+ (void)key_length;
+ (void)existing_value;
+ (void)existing_value_length;
+ (void)operands_list;
+ (void)operands_list_length;
+ (void)num_operands;
+ *new_value_length = 4;
+ *success = 1;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+static char* MergeOperatorPartialMerge(
+ void* arg,
+ const char* key, size_t key_length,
+ const char* const* operands_list, const size_t* operands_list_length,
+ int num_operands,
+ unsigned char* success, size_t* new_value_length) {
+ (void)arg;
+ (void)key;
+ (void)key_length;
+ (void)operands_list;
+ (void)operands_list_length;
+ (void)num_operands;
+ *new_value_length = 4;
+ *success = 1;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+
+static void CheckTxnGet(
+ rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnGetCF(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transaction_get_cf(txn, options, column_family, key,
+ strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnDBGet(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transactiondb_get(txn_db, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transactiondb_get_cf(txn_db, options, column_family, key,
+ strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+int main(int argc, char** argv) {
+ (void)argc;
+ (void)argv;
+ rocksdb_t* db;
+ rocksdb_comparator_t* cmp;
+ rocksdb_cache_t* cache;
+ rocksdb_dbpath_t *dbpath;
+ rocksdb_env_t* env;
+ rocksdb_options_t* options;
+ rocksdb_compactoptions_t* coptions;
+ rocksdb_block_based_table_options_t* table_options;
+ rocksdb_readoptions_t* roptions;
+ rocksdb_writeoptions_t* woptions;
+ rocksdb_ratelimiter_t* rate_limiter;
+ rocksdb_transactiondb_t* txn_db;
+ rocksdb_transactiondb_options_t* txn_db_options;
+ rocksdb_transaction_t* txn;
+ rocksdb_transaction_options_t* txn_options;
+ rocksdb_optimistictransactiondb_t* otxn_db;
+ rocksdb_optimistictransaction_options_t* otxn_options;
+ char* err = NULL;
+ int run = -1;
+
+ snprintf(dbname, sizeof(dbname),
+ "%s/rocksdb_c_test-%d",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ snprintf(dbbackupname, sizeof(dbbackupname),
+ "%s/rocksdb_c_test-%d-backup",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ snprintf(dbcheckpointname, sizeof(dbcheckpointname),
+ "%s/rocksdb_c_test-%d-checkpoint",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ snprintf(sstfilename, sizeof(sstfilename),
+ "%s/rocksdb_c_test-%d-sst",
+ GetTempDir(),
+ ((int)geteuid()));
+
+ snprintf(dbpathname, sizeof(dbpathname),
+ "%s/rocksdb_c_test-%d-dbpath",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ StartPhase("create_objects");
+ cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+ dbpath = rocksdb_dbpath_create(dbpathname, 1024 * 1024);
+ env = rocksdb_create_default_env();
+ cache = rocksdb_cache_create_lru(100000);
+
+ options = rocksdb_options_create();
+ rocksdb_options_set_comparator(options, cmp);
+ rocksdb_options_set_error_if_exists(options, 1);
+ rocksdb_options_set_env(options, env);
+ rocksdb_options_set_info_log(options, NULL);
+ rocksdb_options_set_write_buffer_size(options, 100000);
+ rocksdb_options_set_paranoid_checks(options, 1);
+ rocksdb_options_set_max_open_files(options, 10);
+ rocksdb_options_set_base_background_compactions(options, 1);
+
+ table_options = rocksdb_block_based_options_create();
+ rocksdb_block_based_options_set_block_cache(table_options, cache);
+ rocksdb_block_based_options_set_data_block_index_type(table_options, 1);
+ rocksdb_block_based_options_set_data_block_hash_ratio(table_options, 0.75);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+
+ rocksdb_options_set_compression(options, rocksdb_no_compression);
+ rocksdb_options_set_compression_options(options, -14, -1, 0, 0);
+ int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+ rocksdb_no_compression, rocksdb_no_compression};
+ rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+ rate_limiter = rocksdb_ratelimiter_create(1000 * 1024 * 1024, 100 * 1000, 10);
+ rocksdb_options_set_ratelimiter(options, rate_limiter);
+ rocksdb_ratelimiter_destroy(rate_limiter);
+
+ roptions = rocksdb_readoptions_create();
+ rocksdb_readoptions_set_verify_checksums(roptions, 1);
+ rocksdb_readoptions_set_fill_cache(roptions, 1);
+
+ woptions = rocksdb_writeoptions_create();
+ rocksdb_writeoptions_set_sync(woptions, 1);
+
+ coptions = rocksdb_compactoptions_create();
+ rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1);
+
+ StartPhase("destroy");
+ rocksdb_destroy_db(options, dbname, &err);
+ Free(&err);
+
+ StartPhase("open_error");
+ rocksdb_open(options, dbname, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+
+ StartPhase("open");
+ rocksdb_options_set_create_if_missing(options, 1);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+
+ StartPhase("put");
+ rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("backup_and_restore");
+ {
+ rocksdb_destroy_db(options, dbbackupname, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_create_new_backup(be, db, &err);
+ CheckNoError(err);
+
+ // need a change to trigger a new backup
+ rocksdb_delete(db, woptions, "does-not-exist", 14, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_create_new_backup(be, db, &err);
+ CheckNoError(err);
+
+ const rocksdb_backup_engine_info_t* bei = rocksdb_backup_engine_get_backup_info(be);
+ CheckCondition(rocksdb_backup_engine_info_count(bei) > 1);
+ rocksdb_backup_engine_info_destroy(bei);
+
+ rocksdb_backup_engine_purge_old_backups(be, 1, &err);
+ CheckNoError(err);
+
+ bei = rocksdb_backup_engine_get_backup_info(be);
+ CheckCondition(rocksdb_backup_engine_info_count(bei) == 1);
+ rocksdb_backup_engine_info_destroy(bei);
+
+ rocksdb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_close(db);
+
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create();
+ rocksdb_restore_options_set_keep_log_files(restore_options, 0);
+ rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err);
+ CheckNoError(err);
+ rocksdb_restore_options_destroy(restore_options);
+
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_options_set_error_if_exists(options, 1);
+
+ CheckGet(db, roptions, "foo", "hello");
+
+ rocksdb_backup_engine_close(be);
+ }
+
+ StartPhase("checkpoint");
+ {
+ rocksdb_destroy_db(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ rocksdb_checkpoint_t* checkpoint = rocksdb_checkpoint_object_create(db, &err);
+ CheckNoError(err);
+
+ rocksdb_checkpoint_create(checkpoint, dbcheckpointname, 0, &err);
+ CheckNoError(err);
+
+ // start a new database from the checkpoint
+ rocksdb_close(db);
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ CheckGet(db, roptions, "foo", "hello");
+
+ rocksdb_checkpoint_object_destroy(checkpoint);
+
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("compactall");
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrange");
+ rocksdb_compact_range(db, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactallopt");
+ rocksdb_compact_range_opt(db, coptions, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrangeopt");
+ rocksdb_compact_range_opt(db, coptions, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ // Simple check of cache usage
+ StartPhase("cache_usage");
+ {
+ rocksdb_readoptions_set_pin_data(roptions, 1);
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ rocksdb_iter_seek(iter, "foo", 3);
+
+ size_t usage = rocksdb_cache_get_usage(cache);
+ CheckCondition(usage > 0);
+
+ size_t pin_usage = rocksdb_cache_get_pinned_usage(cache);
+ CheckCondition(pin_usage > 0);
+
+ rocksdb_iter_next(iter);
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_pin_data(roptions, 0);
+ }
+
+ StartPhase("addfile");
+ {
+ rocksdb_envoptions_t* env_opt = rocksdb_envoptions_create();
+ rocksdb_options_t* io_options = rocksdb_options_create();
+ rocksdb_sstfilewriter_t* writer =
+ rocksdb_sstfilewriter_create(env_opt, io_options);
+
+ remove(sstfilename);
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v2", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v3", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingestexternalfileoptions_t* ing_opt =
+ rocksdb_ingestexternalfileoptions_create();
+ const char* file_list[1] = {sstfilename};
+ rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "sstk1", "v1");
+ CheckGet(db, roptions, "sstk2", "v2");
+ CheckGet(db, roptions, "sstk3", "v3");
+
+ remove(sstfilename);
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk22", 6, "v5", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v6", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "sstk1", "v1");
+ CheckGet(db, roptions, "sstk2", "v4");
+ CheckGet(db, roptions, "sstk22", "v5");
+ CheckGet(db, roptions, "sstk3", "v6");
+
+ rocksdb_ingestexternalfileoptions_destroy(ing_opt);
+ rocksdb_sstfilewriter_destroy(writer);
+ rocksdb_options_destroy(io_options);
+ rocksdb_envoptions_destroy(env_opt);
+
+ // Delete all keys we just ingested
+ rocksdb_delete(db, woptions, "sstk1", 5, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk2", 5, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk22", 6, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk3", 5, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("writebatch");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+ rocksdb_writebatch_delete(wb, "bar", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "bay", 3, "d", 1);
+ rocksdb_writebatch_delete_range(wb, "bar", 3, "bay", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "bay", "d");
+ rocksdb_writebatch_clear(wb);
+ const char* start_list[1] = {"bay"};
+ const size_t start_sizes[1] = {3};
+ const char* end_list[1] = {"baz"};
+ const size_t end_sizes[1] = {3};
+ rocksdb_writebatch_delete_rangev(wb, 1, start_list, start_sizes, end_list,
+ end_sizes);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bay", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_vectors");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ const char* k_list[2] = { "z", "ap" };
+ const size_t k_sizes[2] = { 1, 2 };
+ const char* v_list[3] = { "x", "y", "z" };
+ const size_t v_sizes[3] = { 1, 1, 1 };
+ rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", "xyz");
+ rocksdb_writebatch_delete(wb, "zap", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_savepoint");
+ {
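+ // Two save points are set; popping removes the newer one, and rolling back
+ // to the remaining save point discards the putv, so "zap" must not be written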
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_set_save_point(wb);
+ rocksdb_writebatch_set_save_point(wb);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_pop_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_writebatch_rollback_to_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_rep");
+ {
+ rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb1, "baz", 3, "d", 1);
+ rocksdb_writebatch_put(wb1, "quux", 4, "e", 1);
+ rocksdb_writebatch_delete(wb1, "quux", 4);
+ size_t repsize1 = 0;
+ const char* rep = rocksdb_writebatch_data(wb1, &repsize1);
+ rocksdb_writebatch_t* wb2 = rocksdb_writebatch_create_from(rep, repsize1);
+ CheckCondition(rocksdb_writebatch_count(wb1) ==
+ rocksdb_writebatch_count(wb2));
+ size_t repsize2 = 0;
+ CheckCondition(
+ memcmp(rep, rocksdb_writebatch_data(wb2, &repsize2), repsize1) == 0);
+ rocksdb_writebatch_destroy(wb1);
+ rocksdb_writebatch_destroy(wb2);
+ }
+
+ StartPhase("writebatch_wi");
+ {
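+ // WriteBatchWithIndex: get_from_batch reads only the batch, while
+ // get_from_batch_and_db also consults the database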
+ rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_put(wbi, "foo", 3, "a", 1);
+ rocksdb_writebatch_wi_clear(wbi);
+ rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+ rocksdb_writebatch_wi_put(wbi, "box", 3, "c", 1);
+ rocksdb_writebatch_wi_delete(wbi, "bar", 3);
+ int count = rocksdb_writebatch_wi_count(wbi);
+ CheckCondition(count == 3);
+ size_t size;
+ char* value;
+ value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size, &err);
+ CheckValue(err, "c", &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size, &err);
+ CheckValue(err, NULL, &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "foo", 3, &size, &err);
+ CheckValue(err, "hello", &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "box", 3, &size, &err);
+ CheckValue(err, "c", &value, size);
+ rocksdb_write_writebatch_wi(db, woptions, wbi, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ rocksdb_writebatch_wi_iterate(wbi, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ rocksdb_writebatch_wi_clear(wbi);
+ rocksdb_writebatch_wi_destroy(wbi);
+ }
+
+ StartPhase("writebatch_wi_vectors");
+ {
+ rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+ const char* k_list[2] = { "z", "ap" };
+ const size_t k_sizes[2] = { 1, 2 };
+ const char* v_list[3] = { "x", "y", "z" };
+ const size_t v_sizes[3] = { 1, 1, 1 };
+ rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", "xyz");
+ rocksdb_writebatch_wi_delete(wb, "zap", 3);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_wi_destroy(wb);
+ }
+
+ StartPhase("writebatch_wi_savepoint");
+ {
+ rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_set_save_point(wb);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_writebatch_wi_rollback_to_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_wi_destroy(wb);
+ }
+
+ StartPhase("iter");
+ {
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_prev(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_prev(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_last(iter);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek_for_prev(iter, "g", 1);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_seek_for_prev(iter, "box", 3);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+ }
+
+ StartPhase("wbwi_iter");
+ {
+ rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions);
+ rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+ rocksdb_writebatch_wi_delete(wbi, "foo", 3);
+ rocksdb_iterator_t* iter =
+ rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_prev(iter);
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_prev(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_last(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_seek_for_prev(iter, "c", 1);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek_for_prev(iter, "box", 3);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+ rocksdb_writebatch_wi_destroy(wbi);
+ }
+
+ StartPhase("multiget");
+ {
+ const char* keys[3] = { "box", "foo", "notfound" };
+ const size_t keys_sizes[3] = { 3, 3, 8 };
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes, errs);
+
+ int i;
+ for (i = 0; i < 3; i++) {
+ CheckEqual(NULL, errs[i], 0);
+ switch (i) {
+ case 0:
+ CheckEqual("c", vals[i], vals_sizes[i]);
+ break;
+ case 1:
+ CheckEqual("hello", vals[i], vals_sizes[i]);
+ break;
+ case 2:
+ CheckEqual(NULL, vals[i], vals_sizes[i]);
+ break;
+ }
+ Free(&vals[i]);
+ }
+ }
+
+ StartPhase("pin_get");
+ {
+ CheckPinGet(db, roptions, "box", "c");
+ CheckPinGet(db, roptions, "foo", "hello");
+ CheckPinGet(db, roptions, "notfound", NULL);
+ }
+
+ StartPhase("approximate_sizes");
+ {
+ int i;
+ int n = 20000;
+ char keybuf[100];
+ char valbuf[100];
+ uint64_t sizes[2];
+ const char* start[2] = { "a", "k00000000000000010000" };
+ size_t start_len[2] = { 1, 21 };
+ const char* limit[2] = { "k00000000000000010000", "z" };
+ size_t limit_len[2] = { 21, 1 };
+ rocksdb_writeoptions_set_sync(woptions, 0);
+ for (i = 0; i < n; i++) {
+ snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+ snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+ rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+ &err);
+ CheckNoError(err);
+ }
+ rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+ CheckCondition(sizes[0] > 0);
+ CheckCondition(sizes[1] > 0);
+ }
+
+ StartPhase("property");
+ {
+ char* prop = rocksdb_property_value(db, "nosuchprop");
+ CheckCondition(prop == NULL);
+ prop = rocksdb_property_value(db, "rocksdb.stats");
+ CheckCondition(prop != NULL);
+ Free(&prop);
+ }
+
+ StartPhase("snapshot");
+ {
+ const rocksdb_snapshot_t* snap;
+ snap = rocksdb_create_snapshot(db);
+ rocksdb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_set_snapshot(roptions, snap);
+ CheckGet(db, roptions, "foo", "hello");
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ CheckGet(db, roptions, "foo", NULL);
+ rocksdb_release_snapshot(db, snap);
+ }
+
+ StartPhase("repair");
+ {
+ // If we do not compact here, then the lazy deletion of
+ // files (https://reviews.facebook.net/D6123) would leave
+ // around deleted files and the repair process will find
+ // those files and put them back into the database.
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ rocksdb_close(db);
+ rocksdb_options_set_create_if_missing(options, 0);
+ rocksdb_options_set_error_if_exists(options, 0);
+ rocksdb_options_set_wal_recovery_mode(options, 2);
+ rocksdb_repair_db(options, dbname, &err);
+ CheckNoError(err);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ rocksdb_options_set_create_if_missing(options, 1);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("filter");
+ for (run = 0; run <= 2; run++) {
+ // First run uses custom filter
+ // Second run uses old block-based bloom filter
+ // Third run uses full bloom filter
+ CheckNoError(err);
+ rocksdb_filterpolicy_t* policy;
+ if (run == 0) {
+ policy = rocksdb_filterpolicy_create(NULL, FilterDestroy, FilterCreate,
+ FilterKeyMatch, NULL, FilterName);
+ } else if (run == 1) {
+ policy = rocksdb_filterpolicy_create_bloom(8);
+ } else {
+ policy = rocksdb_filterpolicy_create_bloom_full(8);
+ }
+ rocksdb_block_based_options_set_filter_policy(table_options, policy);
+
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+
+ {
+ // Add enough keys to get just one reasonably populated Bloom filter
+ const int keys_to_add = 1500;
+ int i;
+ char keybuf[100];
+ for (i = 0; i < keys_to_add; i++) {
+ snprintf(keybuf, sizeof(keybuf), "yes%020d", i);
+ rocksdb_put(db, woptions, keybuf, strlen(keybuf), "val", 3, &err);
+ CheckNoError(err);
+ }
+ }
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+ fake_filter_result = 1;
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ if (run == 0) {
+ // Must not find value when custom filter returns false
+ fake_filter_result = 0;
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ fake_filter_result = 1;
+
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ }
+
+ {
+ // Query some keys not added to identify Bloom filter implementation
+ // from false positive queries, using perfcontext to detect Bloom
+ // filter behavior
+ rocksdb_perfcontext_t* perf = rocksdb_perfcontext_create();
+ rocksdb_perfcontext_reset(perf);
+
+ const int keys_to_query = 10000;
+ int i;
+ char keybuf[100];
+ for (i = 0; i < keys_to_query; i++) {
+ fake_filter_result = i % 2;
+ snprintf(keybuf, sizeof(keybuf), "no%020d", i);
+ CheckGet(db, roptions, keybuf, NULL);
+ }
+
+ const int hits =
+ (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_hit_count);
+ if (run == 0) {
+ // The fake filter alternates true/false, so half the queries count as hits
+ CheckCondition(hits == keys_to_query / 2);
+ } else if (run == 1) {
+ // Essentially a fingerprint of the block-based Bloom schema
+ CheckCondition(hits == 241);
+ } else {
+ // Essentially a fingerprint of the full Bloom schema(s),
+ // format_version < 5, which vary for three different CACHE_LINE_SIZEs
+ CheckCondition(hits == 224 || hits == 180 || hits == 125);
+ }
+ CheckCondition(
+ (keys_to_query - hits) ==
+ (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_miss_count));
+
+ rocksdb_perfcontext_destroy(perf);
+ }
+
+ // Reset the policy
+ rocksdb_block_based_options_set_filter_policy(table_options, NULL);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+ }
+
+ StartPhase("compaction_filter");
+ {
+ rocksdb_options_t* options_with_filter = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(options_with_filter, 1);
+ rocksdb_compactionfilter_t* cfilter;
+ cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy,
+ CFilterFilter, CFilterName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options_with_filter, dbname, &err);
+ rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+ db = CheckCompaction(db, options_with_filter, roptions, woptions);
+
+ rocksdb_options_set_compaction_filter(options_with_filter, NULL);
+ rocksdb_compactionfilter_destroy(cfilter);
+ rocksdb_options_destroy(options_with_filter);
+ }
+
+ StartPhase("compaction_filter_factory");
+ {
+ rocksdb_options_t* options_with_filter_factory = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(options_with_filter_factory, 1);
+ rocksdb_compactionfilterfactory_t* factory;
+ factory = rocksdb_compactionfilterfactory_create(
+ NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options_with_filter_factory, dbname, &err);
+ rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+ factory);
+ db = CheckCompaction(db, options_with_filter_factory, roptions, woptions);
+
+ rocksdb_options_set_compaction_filter_factory(
+ options_with_filter_factory, NULL);
+ rocksdb_options_destroy(options_with_filter_factory);
+ }
+
+ StartPhase("merge_operator");
+ {
+ rocksdb_mergeoperator_t* merge_operator;
+ merge_operator = rocksdb_mergeoperator_create(
+ NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+ MergeOperatorPartialMerge, NULL, MergeOperatorName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_set_merge_operator(options, merge_operator);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "foovalue");
+ rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "fake");
+
+ // Merge of a non-existing value
+ rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", "fake");
+
+ }
+
+ StartPhase("columnfamilies");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ db = rocksdb_open(db_options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_column_family_handle_t* cfh;
+ cfh = rocksdb_create_column_family(db, db_options, "cf1", &err);
+ rocksdb_column_family_handle_destroy(cfh);
+ CheckNoError(err);
+ rocksdb_close(db);
+
+ size_t cflen;
+ char** column_fams = rocksdb_list_column_families(db_options, dbname, &cflen, &err);
+ CheckNoError(err);
+ CheckEqual("default", column_fams[0], 7);
+ CheckEqual("cf1", column_fams[1], 3);
+ CheckCondition(cflen == 2);
+ rocksdb_list_column_families_destroy(column_fams, cflen);
+
+ rocksdb_options_t* cf_options = rocksdb_options_create();
+
+ const char* cf_names[2] = {"default", "cf1"};
+ const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options};
+ rocksdb_column_family_handle_t* handles[2];
+ db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts, handles, &err);
+ CheckNoError(err);
+
+ rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+
+ rocksdb_put_cf(db, woptions, handles[1], "foobar1", 7, "hello1", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar2", 7, "hello2", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar3", 7, "hello3", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar4", 7, "hello4", 6, &err);
+ CheckNoError(err);
+
+ rocksdb_flushoptions_t *flush_options = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_options, 1);
+ rocksdb_flush_cf(db, flush_options, handles[1], &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_destroy(flush_options);
+
+ CheckGetCF(db, roptions, handles[1], "foo", "hello");
+ CheckPinGetCF(db, roptions, handles[1], "foo", "hello");
+
+ rocksdb_delete_cf(db, woptions, handles[1], "foo", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_delete_range_cf(db, woptions, handles[1], "foobar2", 7, "foobar4",
+ 7, &err);
+ CheckNoError(err);
+
+ CheckGetCF(db, roptions, handles[1], "foo", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "foo", NULL);
+
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put_cf(wb, handles[1], "baz", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put_cf(wb, handles[1], "bar", 3, "b", 1);
+ rocksdb_writebatch_put_cf(wb, handles[1], "box", 3, "c", 1);
+ rocksdb_writebatch_delete_cf(wb, handles[1], "bar", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGetCF(db, roptions, handles[1], "baz", NULL);
+ CheckGetCF(db, roptions, handles[1], "bar", NULL);
+ CheckGetCF(db, roptions, handles[1], "box", "c");
+ CheckPinGetCF(db, roptions, handles[1], "baz", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "bar", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "box", "c");
+ rocksdb_writebatch_destroy(wb);
+
+ const char* keys[3] = { "box", "box", "barfooxx" };
+ const rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] };
+ const size_t keys_sizes[3] = { 3, 3, 8 };
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals, vals_sizes, errs);
+
+ int i;
+ for (i = 0; i < 3; i++) {
+ CheckEqual(NULL, errs[i], 0);
+ switch (i) {
+ case 0:
+ CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf
+ break;
+ case 1:
+ CheckEqual("c", vals[i], vals_sizes[i]); // bingo
+ break;
+ case 2:
+ CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found
+ break;
+ }
+ Free(&vals[i]);
+ }
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator_cf(db, roptions, handles[1]);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+ i++;
+ }
+ CheckCondition(i == 3);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_column_family_handle_t* iters_cf_handles[2] = { handles[0], handles[1] };
+ rocksdb_iterator_t* iters_handles[2];
+ rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2, &err);
+ CheckNoError(err);
+
+ iter = iters_handles[0];
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_destroy(iter);
+
+ iter = iters_handles[1];
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+ i++;
+ }
+ CheckCondition(i == 3);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_drop_column_family(db, handles[1], &err);
+ CheckNoError(err);
+ for (i = 0; i < 2; i++) {
+ rocksdb_column_family_handle_destroy(handles[i]);
+ }
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_destroy(db_options);
+ rocksdb_options_destroy(cf_options);
+ }
+
+ StartPhase("prefix");
+ {
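+ // Prefix seek with a fixed-prefix extractor, hash-skiplist memtable and
+ // plain-table SST format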
+ // Create new database
+ rocksdb_options_set_allow_mmap_reads(options, 1);
+ rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
+ rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+ rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+ rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_seek(iter, "bar", 3);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ CheckIter(iter, "bar1", "bar");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "bar2", "bar");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "bar3", "bar");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_readoptions_set_total_order_seek(roptions, 1);
+ iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_seek(iter, "ba", 2);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar1", "bar");
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_total_order_seek(roptions, 0);
+
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ }
+
+ // Check memory usage stats
+ StartPhase("approximate_memory_usage");
+ {
+ // Create database
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_memory_consumers_t* consumers;
+ consumers = rocksdb_memory_consumers_create();
+ rocksdb_memory_consumers_add_db(consumers, db);
+ rocksdb_memory_consumers_add_cache(consumers, cache);
+
+ // take memory usage report before write-read operation
+ rocksdb_memory_usage_t* mu1;
+ mu1 = rocksdb_approximate_memory_usage_create(consumers, &err);
+ CheckNoError(err);
+
+ // Put data (this should affect memtables)
+ rocksdb_put(db, woptions, "memory", 6, "test", 4, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "memory", "test");
+
+ // take memory usage report after write-read operation
+ rocksdb_memory_usage_t* mu2;
+ mu2 = rocksdb_approximate_memory_usage_create(consumers, &err);
+ CheckNoError(err);
+
+ // amount of memory used within memtables should grow
+ CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >=
+ rocksdb_approximate_memory_usage_get_mem_table_total(mu1));
+ CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >=
+ rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1));
+
+ rocksdb_memory_consumers_destroy(consumers);
+ rocksdb_approximate_memory_usage_destroy(mu1);
+ rocksdb_approximate_memory_usage_destroy(mu2);
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("cuckoo_options");
+ {
+ rocksdb_cuckoo_table_options_t* cuckoo_options;
+ cuckoo_options = rocksdb_cuckoo_options_create();
+ rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5);
+ rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200);
+ rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10);
+ rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1);
+ rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0);
+ rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_cuckoo_options_destroy(cuckoo_options);
+ }
+
+ StartPhase("iterate_upper_bound");
+ {
+ // Create new empty database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_set_prefix_extractor(options, NULL);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_put(db, woptions, "a", 1, "0", 1, &err); CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); CheckNoError(err);
+ rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err);
+ rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); CheckNoError(err);
+
+ // testing basic case with no iterate_upper_bound and no prefix_extractor
+ {
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+ rocksdb_iter_seek(iter, "foo", 3);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo", "bar");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo1", "bar1");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "g1", "0");
+
+ rocksdb_iter_destroy(iter);
+ }
+
+ // testing iterate_upper_bound and forward iterator
+ // to make sure it stops at bound
+ {
+ // iterate_upper_bound points beyond the last expected entry
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+ rocksdb_iter_seek(iter, "foo", 3);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo", "bar");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo1", "bar1");
+
+ rocksdb_iter_next(iter);
+ // should stop here...
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+ }
+ }
+
+ StartPhase("transactions");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ // open a TransactionDB
+ txn_db_options = rocksdb_transactiondb_options_create();
+ txn_options = rocksdb_transaction_options_create();
+ rocksdb_options_set_create_if_missing(options, 1);
+ txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+ CheckNoError(err);
+
+ // put outside a transaction
+ rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+
+ // delete from outside transaction
+ rocksdb_transactiondb_delete(txn_db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+
+ // write batch into TransactionDB
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+ rocksdb_writebatch_delete(wb, "bar", 3);
+ rocksdb_transactiondb_write(txn_db, woptions, wb, &err);
+ rocksdb_writebatch_destroy(wb);
+ CheckTxnDBGet(txn_db, roptions, "box", "c");
+ CheckNoError(err);
+
+ // begin a transaction
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ // put
+ rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo", "hello");
+ // delete
+ rocksdb_transaction_delete(txn, "foo", 3, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo", NULL);
+
+ rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+
+ // read from outside transaction, before commit
+ CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+
+ // commit
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+
+ // read from outside transaction, after commit
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+
+ // reuse old transaction
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, txn);
+
+ // snapshot
+ const rocksdb_snapshot_t* snapshot;
+ snapshot = rocksdb_transactiondb_create_snapshot(txn_db);
+ rocksdb_readoptions_set_snapshot(roptions, snapshot);
+
+ rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err);
+ CheckNoError(err);
+
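+ // roptions still uses the snapshot taken before the "hey" write, so the
+ // read below should see the older value "hello".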
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ rocksdb_transactiondb_release_snapshot(txn_db, snapshot);
+ CheckTxnDBGet(txn_db, roptions, "foo", "hey");
+
+ // iterate
+ rocksdb_transaction_put(txn, "bar", 3, "hi", 2, &err);
+ rocksdb_iterator_t* iter = rocksdb_transaction_create_iterator(txn, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar", "hi");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ // rollback
+ rocksdb_transaction_rollback(txn, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "bar", NULL);
+
+ // save point
+ rocksdb_transaction_put(txn, "foo1", 4, "hi1", 3, &err);
+ rocksdb_transaction_set_savepoint(txn);
+ CheckTxnGet(txn, roptions, "foo1", "hi1");
+ rocksdb_transaction_put(txn, "foo2", 4, "hi2", 3, &err);
+ CheckTxnGet(txn, roptions, "foo2", "hi2");
+
+ // rollback to savepoint
+ rocksdb_transaction_rollback_to_savepoint(txn, &err);
+ CheckNoError(err);
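+ // "foo2" was written after the savepoint, so it is rolled back; "foo1",
+ // written before it, survives. Neither is visible outside the transaction
+ // until the commit below.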
+ CheckTxnGet(txn, roptions, "foo2", NULL);
+ CheckTxnGet(txn, roptions, "foo1", "hi1");
+ CheckTxnDBGet(txn_db, roptions, "foo1", NULL);
+ CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo1", "hi1");
+ CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+
+ // Column families.
+ rocksdb_column_family_handle_t* cfh;
+ cfh = rocksdb_transactiondb_create_column_family(txn_db, options,
+ "txn_db_cf", &err);
+ CheckNoError(err);
+
+ rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello",
+ 8, &err);
+ CheckNoError(err);
+ CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello");
+
+ rocksdb_transactiondb_delete_cf(txn_db, woptions, cfh, "cf_foo", 6, &err);
+ CheckNoError(err);
+ CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
+
+ rocksdb_column_family_handle_destroy(cfh);
+
+ // close and destroy
+ rocksdb_transaction_destroy(txn);
+ rocksdb_transactiondb_close(txn_db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_transaction_options_destroy(txn_options);
+ rocksdb_transactiondb_options_destroy(txn_db_options);
+ }
+
+ StartPhase("optimistic_transactions");
+ {
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ rocksdb_options_set_allow_concurrent_memtable_write(db_options, 1);
+ otxn_db = rocksdb_optimistictransactiondb_open(db_options, dbname, &err);
+ otxn_options = rocksdb_optimistictransaction_options_create();
+ rocksdb_transaction_t* txn1 = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ rocksdb_transaction_t* txn2 = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ rocksdb_transaction_put(txn1, "key", 3, "value", 5, &err);
+ CheckNoError(err);
+ rocksdb_transaction_put(txn2, "key1", 4, "value1", 6, &err);
+ CheckNoError(err);
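+ // The two transactions write disjoint keys, so neither commit should
+ // detect a conflict.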
+ CheckTxnGet(txn1, roptions, "key", "value");
+ rocksdb_transaction_commit(txn1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn2, &err);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn1);
+ rocksdb_transaction_destroy(txn2);
+
+ // Check column family
+ db = rocksdb_optimistictransactiondb_get_base_db(otxn_db);
+ rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
+ CheckNoError(err);
+ rocksdb_column_family_handle_t *cfh1, *cfh2;
+ cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
+ cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
+ txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+ NULL);
+ rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
+ CheckNoError(err);
+ rocksdb_transaction_put_cf(txn, cfh2, "key_cf2", 7, "val_cf2", 7, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+ txn);
+ CheckGetCF(db, roptions, cfh1, "key_cf1", "val_cf1");
+ CheckTxnGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+
+ // Check iterator with column family
+ rocksdb_transaction_put_cf(txn, cfh1, "key1_cf", 7, "val1_cf", 7, &err);
+ CheckNoError(err);
+ rocksdb_iterator_t* iter =
+ rocksdb_transaction_create_iterator_cf(txn, roptions, cfh1);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "key1_cf", "val1_cf");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_transaction_destroy(txn);
+ rocksdb_column_family_handle_destroy(cfh1);
+ rocksdb_column_family_handle_destroy(cfh2);
+ rocksdb_optimistictransactiondb_close_base_db(db);
+ rocksdb_optimistictransactiondb_close(otxn_db);
+
+ // Check open optimistic transaction db with column families
+ size_t cf_len;
+ char** column_fams =
+ rocksdb_list_column_families(db_options, dbname, &cf_len, &err);
+ CheckNoError(err);
+ CheckEqual("default", column_fams[0], 7);
+ CheckEqual("txn_db_cf1", column_fams[1], 10);
+ CheckEqual("txn_db_cf2", column_fams[2], 10);
+ CheckCondition(cf_len == 3);
+ rocksdb_list_column_families_destroy(column_fams, cf_len);
+
+ const char* cf_names[3] = {"default", "txn_db_cf1", "txn_db_cf2"};
+ rocksdb_options_t* cf_options = rocksdb_options_create();
+ const rocksdb_options_t* cf_opts[3] = {cf_options, cf_options, cf_options};
+
+ rocksdb_options_set_error_if_exists(cf_options, 0);
+ rocksdb_column_family_handle_t* cf_handles[3];
+ otxn_db = rocksdb_optimistictransactiondb_open_column_families(
+ db_options, dbname, 3, cf_names, cf_opts, cf_handles, &err);
+ CheckNoError(err);
+ rocksdb_transaction_t* txn_cf = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+ rocksdb_transaction_destroy(txn_cf);
+ rocksdb_options_destroy(cf_options);
+ rocksdb_column_family_handle_destroy(cf_handles[0]);
+ rocksdb_column_family_handle_destroy(cf_handles[1]);
+ rocksdb_column_family_handle_destroy(cf_handles[2]);
+ rocksdb_optimistictransactiondb_close(otxn_db);
+ rocksdb_destroy_db(db_options, dbname, &err);
+ rocksdb_options_destroy(db_options);
+ rocksdb_optimistictransaction_options_destroy(otxn_options);
+ CheckNoError(err);
+ }
+
+ // Simple sanity check that setting memtable rep works.
+ StartPhase("memtable_reps");
+ {
+ // Create database with vector memtable.
+ rocksdb_options_set_memtable_vector_rep(options);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ // Create database with hash skiplist memtable.
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
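+ // Arguments below: bucket count (5000), skiplist height (4), and skiplist
+ // branching factor (4).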
+ rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ // Check that secondary instance works.
+ StartPhase("open_as_secondary");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ db = rocksdb_open(db_options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_t* db1;
+ rocksdb_options_t* opts = rocksdb_options_create();
+ rocksdb_options_set_max_open_files(opts, -1);
+ rocksdb_options_set_create_if_missing(opts, 1);
+ snprintf(secondary_path, sizeof(secondary_path),
+ "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid()));
+ db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err);
+ CheckNoError(err);
+
+ rocksdb_writeoptions_set_sync(woptions, 0);
+ rocksdb_writeoptions_disable_WAL(woptions, 1);
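+ // With the WAL disabled, this write only becomes visible to the secondary
+ // after the flush below produces an SST file that catch-up can pick up.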
+ rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_opts, 1);
+ rocksdb_flush(db, flush_opts, &err);
+ CheckNoError(err);
+ rocksdb_try_catch_up_with_primary(db1, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+ rocksdb_readoptions_set_verify_checksums(ropts, 1);
+ rocksdb_readoptions_set_snapshot(ropts, NULL);
+ CheckGet(db, ropts, "key0", "value0");
+ CheckGet(db1, ropts, "key0", "value0");
+
+ rocksdb_writeoptions_disable_WAL(woptions, 0);
+ rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err);
+ CheckNoError(err);
+ rocksdb_try_catch_up_with_primary(db1, &err);
+ CheckNoError(err);
+ CheckGet(db1, ropts, "key0", "value0");
+ CheckGet(db1, ropts, "key1", "value1");
+
+ rocksdb_close(db1);
+ rocksdb_destroy_db(opts, secondary_path, &err);
+ CheckNoError(err);
+
+ rocksdb_options_destroy(db_options);
+ rocksdb_options_destroy(opts);
+ rocksdb_readoptions_destroy(ropts);
+ rocksdb_flushoptions_destroy(flush_opts);
+ }
+
+ // Simple sanity check that setting db_paths through options works.
+ StartPhase("open_db_paths");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+
+ const rocksdb_dbpath_t* paths[1] = {dbpath};
+ rocksdb_options_set_db_paths(options, paths, 1);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("cleanup");
+ rocksdb_close(db);
+ rocksdb_options_destroy(options);
+ rocksdb_block_based_options_destroy(table_options);
+ rocksdb_readoptions_destroy(roptions);
+ rocksdb_writeoptions_destroy(woptions);
+ rocksdb_compactoptions_destroy(coptions);
+ rocksdb_cache_destroy(cache);
+ rocksdb_comparator_destroy(cmp);
+ rocksdb_dbpath_destroy(dbpath);
+ rocksdb_env_destroy(env);
+
+ fprintf(stderr, "PASS\n");
+ return 0;
+}
+
+#else
+
+int main() {
+ fprintf(stderr, "SKIPPED\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc
new file mode 100644
index 000000000..928a02a1f
--- /dev/null
+++ b/src/rocksdb/db/column_family.cc
@@ -0,0 +1,1523 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_picker.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/db_impl/db_impl.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_properties_collector.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "file/sst_file_manager_impl.h"
+#include "memtable/hash_skiplist_rep.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
+ ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
+ : cfd_(column_family_data), db_(db), mutex_(mutex) {
+ if (cfd_ != nullptr) {
+ cfd_->Ref();
+ }
+}
+
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+ if (cfd_ != nullptr) {
+#ifndef ROCKSDB_LITE
+ for (auto& listener : cfd_->ioptions()->listeners) {
+ listener->OnColumnFamilyHandleDeletionStarted(this);
+ }
+#endif // ROCKSDB_LITE
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread.
+ // We need to hold some shared pointers owned by the initial_cf_options
+ // before the final cleanup finishes.
+ ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
+ JobContext job_context(0);
+ mutex_->Lock();
+ bool dropped = cfd_->IsDropped();
+ if (cfd_->UnrefAndTryDelete()) {
+ if (dropped) {
+ db_->FindObsoleteFiles(&job_context, false, true);
+ }
+ }
+ mutex_->Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ bool defer_purge =
+ db_->immutable_db_options().avoid_unnecessary_blocking_io;
+ db_->PurgeObsoleteFiles(job_context, defer_purge);
+ if (defer_purge) {
+ mutex_->Lock();
+ db_->SchedulePurge();
+ mutex_->Unlock();
+ }
+ }
+ job_context.Clean();
+ }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+const std::string& ColumnFamilyHandleImpl::GetName() const {
+ return cfd()->GetName();
+}
+
+Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
+#ifndef ROCKSDB_LITE
+ // accessing mutable cf-options requires db mutex.
+ InstrumentedMutexLock l(mutex_);
+ *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
+ return Status::OK();
+#else
+ (void)desc;
+ return Status::NotSupported();
+#endif // !ROCKSDB_LITE
+}
+
+const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
+ return cfd()->user_comparator();
+}
+
+void GetIntTblPropCollectorFactory(
+ const ImmutableCFOptions& ioptions,
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories) {
+ auto& collector_factories = ioptions.table_properties_collector_factories;
+ for (size_t i = 0; i < ioptions.table_properties_collector_factories.size();
+ ++i) {
+ assert(collector_factories[i]);
+ int_tbl_prop_collector_factories->emplace_back(
+ new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
+ }
+}
+
+Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
+ if (!cf_options.compression_per_level.empty()) {
+ for (size_t level = 0; level < cf_options.compression_per_level.size();
+ ++level) {
+ if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
+ return Status::InvalidArgument(
+ "Compression type " +
+ CompressionTypeToString(cf_options.compression_per_level[level]) +
+ " is not linked with the binary.");
+ }
+ }
+ } else {
+ if (!CompressionTypeSupported(cf_options.compression)) {
+ return Status::InvalidArgument(
+ "Compression type " +
+ CompressionTypeToString(cf_options.compression) +
+ " is not linked with the binary.");
+ }
+ }
+ if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
+ if (!ZSTD_TrainDictionarySupported()) {
+ return Status::InvalidArgument(
+ "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
+ "is not linked with the binary.");
+ }
+ if (cf_options.compression_opts.max_dict_bytes == 0) {
+ return Status::InvalidArgument(
+ "The dictionary size limit (`CompressionOptions::max_dict_bytes`) "
+ "should be nonzero if we're using zstd's dictionary generator.");
+ }
+ }
+ return Status::OK();
+}
+
+Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
+ if (cf_options.inplace_update_support) {
+ return Status::InvalidArgument(
+ "In-place memtable updates (inplace_update_support) is not compatible "
+ "with concurrent writes (allow_concurrent_memtable_write)");
+ }
+ if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
+ return Status::InvalidArgument(
+ "Memtable doesn't concurrent writes (allow_concurrent_memtable_write)");
+ }
+ return Status::OK();
+}
+
+Status CheckCFPathsSupported(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options) {
+ // More than one entry in cf_paths is supported only in universal
+ // and level compaction styles. This function also checks the case
+ // in which cf_paths is not specified, which results in db_paths
+ // being used.
+ if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
+ (cf_options.compaction_style != kCompactionStyleLevel)) {
+ if (cf_options.cf_paths.size() > 1) {
+ return Status::NotSupported(
+ "More than one CF paths are only supported in "
+ "universal and level compaction styles. ");
+ } else if (cf_options.cf_paths.empty() &&
+ db_options.db_paths.size() > 1) {
+ return Status::NotSupported(
+ "More than one DB paths are only supported in "
+ "universal and level compaction styles. ");
+ }
+ }
+ return Status::OK();
+}
+
+namespace {
+const uint64_t kDefaultTtl = 0xfffffffffffffffe;
+const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
+}; // namespace
+
+ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+ const ColumnFamilyOptions& src) {
+ ColumnFamilyOptions result = src;
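+ // Clamp write_buffer_size to at least 64 KB and at most 0xffffffff (~4 GB)
+ // on 32-bit builds, or 64 GB on 64-bit builds.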
+ size_t clamp_max = std::conditional<
+ sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
+ std::integral_constant<uint64_t, 64ull << 30>>::type::value;
+ ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, clamp_max);
+ // if the user sets arena_block_size, we trust that value. Otherwise,
+ // calculate a proper value from write_buffer_size.
+ if (result.arena_block_size <= 0) {
+ result.arena_block_size = result.write_buffer_size / 8;
+
+ // Align up to 4k
+ const size_t align = 4 * 1024;
+ result.arena_block_size =
+ ((result.arena_block_size + align - 1) / align) * align;
+ }
+ result.min_write_buffer_number_to_merge =
+ std::min(result.min_write_buffer_number_to_merge,
+ result.max_write_buffer_number - 1);
+ if (result.min_write_buffer_number_to_merge < 1) {
+ result.min_write_buffer_number_to_merge = 1;
+ }
+
+ if (result.num_levels < 1) {
+ result.num_levels = 1;
+ }
+ if (result.compaction_style == kCompactionStyleLevel &&
+ result.num_levels < 2) {
+ result.num_levels = 2;
+ }
+
+ if (result.compaction_style == kCompactionStyleUniversal &&
+ db_options.allow_ingest_behind && result.num_levels < 3) {
+ result.num_levels = 3;
+ }
+
+ if (result.max_write_buffer_number < 2) {
+ result.max_write_buffer_number = 2;
+ }
+ // fall back to max_write_buffer_number_to_maintain if
+ // max_write_buffer_size_to_maintain is not set
+ if (result.max_write_buffer_size_to_maintain < 0) {
+ result.max_write_buffer_size_to_maintain =
+ result.max_write_buffer_number *
+ static_cast<int64_t>(result.write_buffer_size);
+ } else if (result.max_write_buffer_size_to_maintain == 0 &&
+ result.max_write_buffer_number_to_maintain < 0) {
+ result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
+ }
+ // bloom filter size shouldn't exceed 1/4 of memtable size.
+ if (result.memtable_prefix_bloom_size_ratio > 0.25) {
+ result.memtable_prefix_bloom_size_ratio = 0.25;
+ } else if (result.memtable_prefix_bloom_size_ratio < 0) {
+ result.memtable_prefix_bloom_size_ratio = 0;
+ }
+
+ if (!result.prefix_extractor) {
+ assert(result.memtable_factory);
+ Slice name = result.memtable_factory->Name();
+ if (name.compare("HashSkipListRepFactory") == 0 ||
+ name.compare("HashLinkListRepFactory") == 0) {
+ result.memtable_factory = std::make_shared<SkipListFactory>();
+ }
+ }
+
+ if (result.compaction_style == kCompactionStyleFIFO) {
+ result.num_levels = 1;
+ // since we delete level0 files in FIFO compaction when there are too many
+ // of them, these options don't really mean anything
+ result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+ result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+ }
+
+ if (result.max_bytes_for_level_multiplier <= 0) {
+ result.max_bytes_for_level_multiplier = 1;
+ }
+
+ if (result.level0_file_num_compaction_trigger == 0) {
+ ROCKS_LOG_WARN(db_options.info_log.get(),
+ "level0_file_num_compaction_trigger cannot be 0");
+ result.level0_file_num_compaction_trigger = 1;
+ }
+
+ if (result.level0_stop_writes_trigger <
+ result.level0_slowdown_writes_trigger ||
+ result.level0_slowdown_writes_trigger <
+ result.level0_file_num_compaction_trigger) {
+ ROCKS_LOG_WARN(db_options.info_log.get(),
+ "This condition must be satisfied: "
+ "level0_stop_writes_trigger(%d) >= "
+ "level0_slowdown_writes_trigger(%d) >= "
+ "level0_file_num_compaction_trigger(%d)",
+ result.level0_stop_writes_trigger,
+ result.level0_slowdown_writes_trigger,
+ result.level0_file_num_compaction_trigger);
+ if (result.level0_slowdown_writes_trigger <
+ result.level0_file_num_compaction_trigger) {
+ result.level0_slowdown_writes_trigger =
+ result.level0_file_num_compaction_trigger;
+ }
+ if (result.level0_stop_writes_trigger <
+ result.level0_slowdown_writes_trigger) {
+ result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
+ }
+ ROCKS_LOG_WARN(db_options.info_log.get(),
+ "Adjust the value to "
+ "level0_stop_writes_trigger(%d)"
+ "level0_slowdown_writes_trigger(%d)"
+ "level0_file_num_compaction_trigger(%d)",
+ result.level0_stop_writes_trigger,
+ result.level0_slowdown_writes_trigger,
+ result.level0_file_num_compaction_trigger);
+ }
+
+ if (result.soft_pending_compaction_bytes_limit == 0) {
+ result.soft_pending_compaction_bytes_limit =
+ result.hard_pending_compaction_bytes_limit;
+ } else if (result.hard_pending_compaction_bytes_limit > 0 &&
+ result.soft_pending_compaction_bytes_limit >
+ result.hard_pending_compaction_bytes_limit) {
+ result.soft_pending_compaction_bytes_limit =
+ result.hard_pending_compaction_bytes_limit;
+ }
+
+#ifndef ROCKSDB_LITE
+ // When the DB is stopped, it's possible that there are some .trash files
+ // that were not deleted yet. When we open the DB we will find these .trash
+ // files and schedule them to be deleted (or delete them immediately if an
+ // SstFileManager was not used).
+ auto sfm = static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
+ for (size_t i = 0; i < result.cf_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path);
+ }
+#endif
+
+ if (result.cf_paths.empty()) {
+ result.cf_paths = db_options.db_paths;
+ }
+
+ if (result.level_compaction_dynamic_level_bytes) {
+ if (result.compaction_style != kCompactionStyleLevel ||
+ result.cf_paths.size() > 1U) {
+ // 1. level_compaction_dynamic_level_bytes only makes sense for
+ // level-based compaction.
+ // 2. we don't yet know how to make this feature and multiple
+ // DB paths work together.
+ result.level_compaction_dynamic_level_bytes = false;
+ }
+ }
+
+ if (result.max_compaction_bytes == 0) {
+ result.max_compaction_bytes = result.target_file_size_base * 25;
+ }
+
+ bool is_block_based_table =
+ (result.table_factory->Name() == BlockBasedTableFactory().Name());
+
+ const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
+ if (result.ttl == kDefaultTtl) {
+ if (is_block_based_table &&
+ result.compaction_style != kCompactionStyleFIFO) {
+ result.ttl = kAdjustedTtl;
+ } else {
+ result.ttl = 0;
+ }
+ }
+
+ const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
+
+ // Turn on periodic compactions and set them to occur once every 30 days if
+ // compaction filters are used and periodic_compaction_seconds is set to the
+ // default value.
+ if (result.compaction_style != kCompactionStyleFIFO) {
+ if ((result.compaction_filter != nullptr ||
+ result.compaction_filter_factory != nullptr) &&
+ result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
+ is_block_based_table) {
+ result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+ }
+ } else {
+ // result.compaction_style == kCompactionStyleFIFO
+ if (result.ttl == 0) {
+ if (is_block_based_table) {
+ if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+ result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+ }
+ result.ttl = result.periodic_compaction_seconds;
+ }
+ } else if (result.periodic_compaction_seconds != 0) {
+ result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
+ }
+ }
+
+ // TTL compactions work similarly to Periodic Compactions in Universal
+ // compaction in most cases. So, if ttl is set, execute the periodic
+ // compaction codepath.
+ if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
+ if (result.periodic_compaction_seconds != 0) {
+ result.periodic_compaction_seconds =
+ std::min(result.ttl, result.periodic_compaction_seconds);
+ } else {
+ result.periodic_compaction_seconds = result.ttl;
+ }
+ }
+
+ if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+ result.periodic_compaction_seconds = 0;
+ }
+
+ return result;
+}
+
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+SuperVersion::~SuperVersion() {
+ for (auto td : to_delete) {
+ delete td;
+ }
+}
+
+SuperVersion* SuperVersion::Ref() {
+ refs.fetch_add(1, std::memory_order_relaxed);
+ return this;
+}
+
+bool SuperVersion::Unref() {
+ // fetch_sub returns the previous value of refs
+ uint32_t previous_refs = refs.fetch_sub(1);
+ assert(previous_refs > 0);
+ return previous_refs == 1;
+}
+
+void SuperVersion::Cleanup() {
+ assert(refs.load(std::memory_order_relaxed) == 0);
+ imm->Unref(&to_delete);
+ MemTable* m = mem->Unref();
+ if (m != nullptr) {
+ auto* memory_usage = current->cfd()->imm()->current_memory_usage();
+ assert(*memory_usage >= m->ApproximateMemoryUsage());
+ *memory_usage -= m->ApproximateMemoryUsage();
+ to_delete.push_back(m);
+ }
+ current->Unref();
+ if (cfd->Unref()) {
+ delete cfd;
+ }
+}
+
+void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+ MemTableListVersion* new_imm, Version* new_current) {
+ cfd = new_cfd;
+ mem = new_mem;
+ imm = new_imm;
+ current = new_current;
+ cfd->Ref();
+ mem->Ref();
+ imm->Ref();
+ current->Ref();
+ refs.store(1, std::memory_order_relaxed);
+}
+
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+ // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
+ // destroyed. When the former happens, the thread shouldn't see kSVInUse.
+ // When the latter happens, we are in ~ColumnFamilyData(), so no get should
+ // happen either.
+ SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+ bool was_last_ref __attribute__((__unused__));
+ was_last_ref = sv->Unref();
+ // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_.
+ // This is important because we can't do SuperVersion cleanup here.
+ // That would require locking DB mutex, which would deadlock because
+ // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex.
+ assert(!was_last_ref);
+}
+} // anonymous namespace
+
+ColumnFamilyData::ColumnFamilyData(
+ uint32_t id, const std::string& name, Version* _dummy_versions,
+ Cache* _table_cache, WriteBufferManager* write_buffer_manager,
+ const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, ColumnFamilySet* column_family_set,
+ BlockCacheTracer* const block_cache_tracer)
+ : id_(id),
+ name_(name),
+ dummy_versions_(_dummy_versions),
+ current_(nullptr),
+ refs_(0),
+ initialized_(false),
+ dropped_(false),
+ internal_comparator_(cf_options.comparator),
+ initial_cf_options_(SanitizeOptions(db_options, cf_options)),
+ ioptions_(db_options, initial_cf_options_),
+ mutable_cf_options_(initial_cf_options_),
+ is_delete_range_supported_(
+ cf_options.table_factory->IsDeleteRangeSupported()),
+ write_buffer_manager_(write_buffer_manager),
+ mem_(nullptr),
+ imm_(ioptions_.min_write_buffer_number_to_merge,
+ ioptions_.max_write_buffer_number_to_maintain,
+ ioptions_.max_write_buffer_size_to_maintain),
+ super_version_(nullptr),
+ super_version_number_(0),
+ local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+ next_(nullptr),
+ prev_(nullptr),
+ log_number_(0),
+ flush_reason_(FlushReason::kOthers),
+ column_family_set_(column_family_set),
+ queued_for_flush_(false),
+ queued_for_compaction_(false),
+ prev_compaction_needed_bytes_(0),
+ allow_2pc_(db_options.allow_2pc),
+ last_memtable_id_(0) {
+ Ref();
+
+ // Convert user defined table properties collector factories to internal ones.
+ GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_);
+
+ // if _dummy_versions is nullptr, then this is a dummy column family.
+ if (_dummy_versions != nullptr) {
+ internal_stats_.reset(
+ new InternalStats(ioptions_.num_levels, db_options.env, this));
+ table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache,
+ block_cache_tracer));
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ compaction_picker_.reset(
+ new LevelCompactionPicker(ioptions_, &internal_comparator_));
+#ifndef ROCKSDB_LITE
+ } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ compaction_picker_.reset(
+ new UniversalCompactionPicker(ioptions_, &internal_comparator_));
+ } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
+ compaction_picker_.reset(
+ new FIFOCompactionPicker(ioptions_, &internal_comparator_));
+ } else if (ioptions_.compaction_style == kCompactionStyleNone) {
+ compaction_picker_.reset(new NullCompactionPicker(
+ ioptions_, &internal_comparator_));
+ ROCKS_LOG_WARN(ioptions_.info_log,
+ "Column family %s does not use any background compaction. "
+ "Compactions can only be done via CompactFiles\n",
+ GetName().c_str());
+#endif // !ROCKSDB_LITE
+ } else {
+ ROCKS_LOG_ERROR(ioptions_.info_log,
+ "Unable to recognize the specified compaction style %d. "
+ "Column family %s will use kCompactionStyleLevel.\n",
+ ioptions_.compaction_style, GetName().c_str());
+ compaction_picker_.reset(
+ new LevelCompactionPicker(ioptions_, &internal_comparator_));
+ }
+
+ if (column_family_set_->NumberOfColumnFamilies() < 10) {
+ ROCKS_LOG_INFO(ioptions_.info_log,
+ "--------------- Options for column family [%s]:\n",
+ name.c_str());
+ initial_cf_options_.Dump(ioptions_.info_log);
+ } else {
+ ROCKS_LOG_INFO(ioptions_.info_log, "\t(skipping printing options)\n");
+ }
+ }
+
+ RecalculateWriteStallConditions(mutable_cf_options_);
+}
+
+// DB mutex held
+ColumnFamilyData::~ColumnFamilyData() {
+ assert(refs_.load(std::memory_order_relaxed) == 0);
+ // remove from linked list
+ auto prev = prev_;
+ auto next = next_;
+ prev->next_ = next;
+ next->prev_ = prev;
+
+ if (!dropped_ && column_family_set_ != nullptr) {
+ // If it's dropped, it's already removed from column family set
+ // If column_family_set_ == nullptr, this is dummy CFD and not in
+ // ColumnFamilySet
+ column_family_set_->RemoveColumnFamily(this);
+ }
+
+ if (current_ != nullptr) {
+ current_->Unref();
+ }
+
+ // It would be wrong if this ColumnFamilyData is in flush_queue_ or
+ // compaction_queue_ and we destroyed it
+ assert(!queued_for_flush_);
+ assert(!queued_for_compaction_);
+ assert(super_version_ == nullptr);
+
+ if (dummy_versions_ != nullptr) {
+ // List must be empty
+ assert(dummy_versions_->TEST_Next() == dummy_versions_);
+ bool deleted __attribute__((__unused__));
+ deleted = dummy_versions_->Unref();
+ assert(deleted);
+ }
+
+ if (mem_ != nullptr) {
+ delete mem_->Unref();
+ }
+ autovector<MemTable*> to_delete;
+ imm_.current()->Unref(&to_delete);
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+bool ColumnFamilyData::UnrefAndTryDelete() {
+ int old_refs = refs_.fetch_sub(1);
+ assert(old_refs > 0);
+
+ if (old_refs == 1) {
+ assert(super_version_ == nullptr);
+ delete this;
+ return true;
+ }
+
+ if (old_refs == 2 && super_version_ != nullptr) {
+ // Only the super_version_ holds me
+ SuperVersion* sv = super_version_;
+ super_version_ = nullptr;
+ // Release SuperVersion reference kept in ThreadLocalPtr.
+ // This must be done outside of mutex_ since unref handler can lock mutex.
+ sv->db_mutex->Unlock();
+ local_sv_.reset();
+ sv->db_mutex->Lock();
+
+ if (sv->Unref()) {
+ // May delete this ColumnFamilyData after calling Cleanup()
+ sv->Cleanup();
+ delete sv;
+ return true;
+ }
+ }
+ return false;
+}
+
+void ColumnFamilyData::SetDropped() {
+ // can't drop default CF
+ assert(id_ != 0);
+ dropped_ = true;
+ write_controller_token_.reset();
+
+ // remove from column_family_set
+ column_family_set_->RemoveColumnFamily(this);
+}
+
+ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const {
+ return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+}
+
+uint64_t ColumnFamilyData::OldestLogToKeep() {
+ auto current_log = GetLogNumber();
+
+ if (allow_2pc_) {
+ autovector<MemTable*> empty_list;
+ auto imm_prep_log =
+ imm()->PrecomputeMinLogContainingPrepSection(empty_list);
+ auto mem_prep_log = mem()->GetMinLogContainingPrepSection();
+
+ if (imm_prep_log > 0 && imm_prep_log < current_log) {
+ current_log = imm_prep_log;
+ }
+
+ if (mem_prep_log > 0 && mem_prep_log < current_log) {
+ current_log = mem_prep_log;
+ }
+ }
+
+ return current_log;
+}
+
+const double kIncSlowdownRatio = 0.8;
+const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
+const double kNearStopSlowdownRatio = 0.6;
+const double kDelayRecoverSlowdownRatio = 1.4;
+
+namespace {
+// If penalize_stop is true, we further reduce slowdown rate.
+std::unique_ptr<WriteControllerToken> SetupDelay(
+ WriteController* write_controller, uint64_t compaction_needed_bytes,
+ uint64_t prev_compaction_need_bytes, bool penalize_stop,
+ bool auto_compactions_disabled) {
+ const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s.
+
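+ // Summary of the adjustments below: the current delayed write rate is
+ // multiplied by kNearStopSlowdownRatio (0.6) near a stop, by
+ // kIncSlowdownRatio (0.8) while compaction debt is not shrinking, and by
+ // kDecSlowdownRatio (1 / 0.8) once debt shrinks, clamped to
+ // [kMinWriteRate, max_delayed_write_rate].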
+ uint64_t max_write_rate = write_controller->max_delayed_write_rate();
+ uint64_t write_rate = write_controller->delayed_write_rate();
+
+ if (auto_compactions_disabled) {
+ // When auto compaction is disabled, always use the value the user gave.
+ write_rate = max_write_rate;
+ } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
+ // If user gives rate less than kMinWriteRate, don't adjust it.
+ //
+ // If already delayed, need to adjust based on previous compaction debt.
+ // When two or more column families require delay, we always
+ // increase or reduce the write rate based on information for one single
+ // column family. It is likely to be OK, but we can improve it if this
+ // becomes a problem.
+ // Ignore compaction_needed_bytes = 0 case because compaction_needed_bytes
+ // is only available in level-based compaction
+ //
+ // If the compaction debt stays the same as previously, we also further slow
+ // down. It usually means a mem table is full. It's mainly for the case
+ // where both of flush and compaction are much slower than the speed we
+ // insert to mem tables, so we need to actively slow down before we get
+ // feedback signal from compaction and flushes to avoid the full stop
+ // because of hitting the max write buffer number.
+ //
+ // If the DB just fell into the stop condition, we need to further reduce
+ // the write rate to avoid the stop condition.
+ if (penalize_stop) {
+ // Penalize the near stop or stop condition by more aggressive slowdown.
+ // This is to provide the long term slowdown increase signal.
+ // The penalty is more than the reward of recovering to the normal
+ // condition.
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kNearStopSlowdownRatio);
+ if (write_rate < kMinWriteRate) {
+ write_rate = kMinWriteRate;
+ }
+ } else if (prev_compaction_need_bytes > 0 &&
+ prev_compaction_need_bytes <= compaction_needed_bytes) {
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kIncSlowdownRatio);
+ if (write_rate < kMinWriteRate) {
+ write_rate = kMinWriteRate;
+ }
+ } else if (prev_compaction_need_bytes > compaction_needed_bytes) {
+ // We speed up by a ratio of kDecSlowdownRatio when we have paid down
+ // compaction debt, but we'll never speed up past the write rate
+ // given by the user.
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kDecSlowdownRatio);
+ if (write_rate > max_write_rate) {
+ write_rate = max_write_rate;
+ }
+ }
+ }
+ return write_controller->GetDelayToken(write_rate);
+}
+
+int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
+ int level0_slowdown_writes_trigger) {
+ // SanitizeOptions() ensures it.
+ assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);
+
+ if (level0_file_num_compaction_trigger < 0) {
+ return std::numeric_limits<int>::max();
+ }
+
+ const int64_t twice_level0_trigger =
+ static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;
+
+ const int64_t one_fourth_trigger_slowdown =
+ static_cast<int64_t>(level0_file_num_compaction_trigger) +
+ ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
+ 4);
+
+ assert(twice_level0_trigger >= 0);
+ assert(one_fourth_trigger_slowdown >= 0);
+
+ // 1/4 of the way between the L0 compaction trigger threshold and the
+ // slowdown condition, or twice the compaction trigger, whichever is
+ // smaller.
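+ // Example: with level0_file_num_compaction_trigger == 4 and
+ // level0_slowdown_writes_trigger == 20, both candidates evaluate to 8,
+ // so compaction is sped up once there are 8 L0 files.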
+ int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
+ if (res >= port::kMaxInt32) {
+ return port::kMaxInt32;
+ } else {
+ // res fits in int
+ return static_cast<int>(res);
+ }
+}
+} // namespace
+
+std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause>
+ColumnFamilyData::GetWriteStallConditionAndCause(
+ int num_unflushed_memtables, int num_l0_files,
+ uint64_t num_compaction_needed_bytes,
+ const MutableCFOptions& mutable_cf_options) {
+ if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
+ return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
+ return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+ num_compaction_needed_bytes >=
+ mutable_cf_options.hard_pending_compaction_bytes_limit) {
+ return {WriteStallCondition::kStopped,
+ WriteStallCause::kPendingCompactionBytes};
+ } else if (mutable_cf_options.max_write_buffer_number > 3 &&
+ num_unflushed_memtables >=
+ mutable_cf_options.max_write_buffer_number - 1) {
+ return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+ num_l0_files >=
+ mutable_cf_options.level0_slowdown_writes_trigger) {
+ return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
+ num_compaction_needed_bytes >=
+ mutable_cf_options.soft_pending_compaction_bytes_limit) {
+ return {WriteStallCondition::kDelayed,
+ WriteStallCause::kPendingCompactionBytes};
+ }
+ return {WriteStallCondition::kNormal, WriteStallCause::kNone};
+}
+
+WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
+ const MutableCFOptions& mutable_cf_options) {
+ auto write_stall_condition = WriteStallCondition::kNormal;
+ if (current_ != nullptr) {
+ auto* vstorage = current_->storage_info();
+ auto write_controller = column_family_set_->write_controller_;
+ uint64_t compaction_needed_bytes =
+ vstorage->estimated_compaction_needed_bytes();
+
+ auto write_stall_condition_and_cause = GetWriteStallConditionAndCause(
+ imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(),
+ vstorage->estimated_compaction_needed_bytes(), mutable_cf_options);
+ write_stall_condition = write_stall_condition_and_cause.first;
+ auto write_stall_cause = write_stall_condition_and_cause.second;
+
+ bool was_stopped = write_controller->IsStopped();
+ bool needed_delay = write_controller->NeedsDelay();
+
+ if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kMemtableLimit) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.info_log,
+ "[%s] Stopping writes because we have %d immutable memtables "
+ "(waiting for flush), max_write_buffer_number is set to %d",
+ name_.c_str(), imm()->NumNotFlushed(),
+ mutable_cf_options.max_write_buffer_number);
+ } else if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
+ if (compaction_picker_->IsLevel0CompactionInProgress()) {
+ internal_stats_->AddCFStats(
+ InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
+ }
+ ROCKS_LOG_WARN(ioptions_.info_log,
+ "[%s] Stopping writes because we have %d level-0 files",
+ name_.c_str(), vstorage->l0_delay_trigger_count());
+ } else if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(
+ InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.info_log,
+ "[%s] Stopping writes because of estimated pending compaction "
+ "bytes %" PRIu64,
+ name_.c_str(), compaction_needed_bytes);
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kMemtableLimit) {
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.info_log,
+ "[%s] Stalling writes because we have %d immutable memtables "
+ "(waiting for flush), max_write_buffer_number is set to %d "
+ "rate %" PRIu64,
+ name_.c_str(), imm()->NumNotFlushed(),
+ mutable_cf_options.max_write_buffer_number,
+ write_controller->delayed_write_rate());
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+ // L0 is within two files of the stop trigger.
+ bool near_stop = vstorage->l0_delay_trigger_count() >=
+ mutable_cf_options.level0_stop_writes_trigger - 2;
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped || near_stop,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ 1);
+ if (compaction_picker_->IsLevel0CompactionInProgress()) {
+ internal_stats_->AddCFStats(
+ InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
+ }
+ ROCKS_LOG_WARN(ioptions_.info_log,
+ "[%s] Stalling writes because we have %d level-0 files "
+ "rate %" PRIu64,
+ name_.c_str(), vstorage->l0_delay_trigger_count(),
+ write_controller->delayed_write_rate());
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+ // If the distance to the hard limit is less than 1/4 of the gap between
+ // the soft and hard bytes limits, we think it is near stop and speed up
+ // the slowdown.
+ bool near_stop =
+ mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+ (compaction_needed_bytes -
+ mutable_cf_options.soft_pending_compaction_bytes_limit) >
+ 3 * (mutable_cf_options.hard_pending_compaction_bytes_limit -
+ mutable_cf_options.soft_pending_compaction_bytes_limit) /
+ 4;
+
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped || near_stop,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(
+ InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.info_log,
+ "[%s] Stalling writes because of estimated pending compaction "
+ "bytes %" PRIu64 " rate %" PRIu64,
+ name_.c_str(), vstorage->estimated_compaction_needed_bytes(),
+ write_controller->delayed_write_rate());
+ } else {
+ assert(write_stall_condition == WriteStallCondition::kNormal);
+ if (vstorage->l0_delay_trigger_count() >=
+ GetL0ThresholdSpeedupCompaction(
+ mutable_cf_options.level0_file_num_compaction_trigger,
+ mutable_cf_options.level0_slowdown_writes_trigger)) {
+ write_controller_token_ =
+ write_controller->GetCompactionPressureToken();
+ ROCKS_LOG_INFO(
+ ioptions_.info_log,
+ "[%s] Increasing compaction threads because we have %d level-0 "
+ "files ",
+ name_.c_str(), vstorage->l0_delay_trigger_count());
+ } else if (vstorage->estimated_compaction_needed_bytes() >=
+ mutable_cf_options.soft_pending_compaction_bytes_limit / 4) {
+ // Increase compaction threads if bytes needed for compaction exceeds
+ // 1/4 of threshold for slowing down.
+ // If soft pending compaction byte limit is not set, always speed up
+ // compaction.
+ write_controller_token_ =
+ write_controller->GetCompactionPressureToken();
+ if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) {
+ ROCKS_LOG_INFO(
+ ioptions_.info_log,
+ "[%s] Increasing compaction threads because of estimated pending "
+ "compaction "
+ "bytes %" PRIu64,
+ name_.c_str(), vstorage->estimated_compaction_needed_bytes());
+ }
+ } else {
+ write_controller_token_.reset();
+ }
+ // If the DB recovers from delay conditions, we reward it by reducing the
+ // slowdown by roughly double the slowdown ratio. This is to balance the
+ // long term slowdown increase signal.
+ if (needed_delay) {
+ uint64_t write_rate = write_controller->delayed_write_rate();
+ write_controller->set_delayed_write_rate(static_cast<uint64_t>(
+ static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio));
+ // Set the low pri limit to be 1/4 the delayed write rate.
+ // Note we don't reset this value even after the delay condition is
+ // released. The low-pri rate will continue to apply if there is
+ // compaction pressure.
+ write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate /
+ 4);
+ }
+ }
+ prev_compaction_needed_bytes_ = compaction_needed_bytes;
+ }
+ return write_stall_condition;
+}
+
+const FileOptions* ColumnFamilyData::soptions() const {
+ return &(column_family_set_->file_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+ current_ = current_version;
+}
+
+uint64_t ColumnFamilyData::GetNumLiveVersions() const {
+ return VersionSet::GetNumLiveVersions(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
+ return VersionSet::GetTotalSstFilesSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
+ return current_->GetSstFilesSize();
+}
+
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+ const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+ return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+ write_buffer_manager_, earliest_seq, id_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+ const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+ if (mem_ != nullptr) {
+ delete mem_->Unref();
+ }
+ SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
+ mem_->Ref();
+}
+
+bool ColumnFamilyData::NeedsCompaction() const {
+ return compaction_picker_->NeedsCompaction(current_->storage_info());
+}
+
+Compaction* ColumnFamilyData::PickCompaction(
+ const MutableCFOptions& mutable_options, LogBuffer* log_buffer) {
+ SequenceNumber earliest_mem_seqno =
+ std::min(mem_->GetEarliestSequenceNumber(),
+ imm_.current()->GetEarliestSequenceNumber(false));
+ auto* result = compaction_picker_->PickCompaction(
+ GetName(), mutable_options, current_->storage_info(), log_buffer,
+ earliest_mem_seqno);
+ if (result != nullptr) {
+ result->SetInputVersion(current_);
+ }
+ return result;
+}
+
+bool ColumnFamilyData::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ return compaction_picker_->RangeOverlapWithCompaction(
+ smallest_user_key, largest_user_key, level);
+}
+
+Status ColumnFamilyData::RangesOverlapWithMemtables(
+ const autovector<Range>& ranges, SuperVersion* super_version,
+ bool* overlap) {
+ assert(overlap != nullptr);
+ *overlap = false;
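+ // Strategy: merge the active and immutable memtables into one iterator
+ // (plus a range-tombstone aggregator), then seek to the start of each
+ // range and check whether the first key found, or any range tombstone,
+ // falls inside [start, limit].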
+ // Create an InternalIterator over all unflushed memtables
+ Arena arena;
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
+ merge_iter_builder.AddIterator(
+ super_version->mem->NewIterator(read_opts, &arena));
+ super_version->imm->AddIterators(read_opts, &merge_iter_builder);
+ ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
+
+ auto read_seq = super_version->current->version_set()->LastSequence();
+ ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
+ auto* active_range_del_iter =
+ super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq);
+ range_del_agg.AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter));
+ super_version->imm->AddRangeTombstoneIterators(read_opts, nullptr /* arena */,
+ &range_del_agg);
+
+ Status status;
+ for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) {
+ auto* vstorage = super_version->current->storage_info();
+ auto* ucmp = vstorage->InternalComparator()->user_comparator();
+ InternalKey range_start(ranges[i].start, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ memtable_iter->Seek(range_start.Encode());
+ status = memtable_iter->status();
+ ParsedInternalKey seek_result;
+ if (status.ok()) {
+ if (memtable_iter->Valid() &&
+ !ParseInternalKey(memtable_iter->key(), &seek_result)) {
+ status = Status::Corruption("DB have corrupted keys");
+ }
+ }
+ if (status.ok()) {
+ if (memtable_iter->Valid() &&
+ ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) {
+ *overlap = true;
+ } else if (range_del_agg.IsRangeOverlapped(ranges[i].start,
+ ranges[i].limit)) {
+ *overlap = true;
+ }
+ }
+ }
+ return status;
+}
+
+const int ColumnFamilyData::kCompactAllLevels = -1;
+const int ColumnFamilyData::kCompactToBaseLevel = -2;
+
+Compaction* ColumnFamilyData::CompactRange(
+ const MutableCFOptions& mutable_cf_options, int input_level,
+ int output_level, const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* conflict,
+ uint64_t max_file_num_to_ignore) {
+ auto* result = compaction_picker_->CompactRange(
+ GetName(), mutable_cf_options, current_->storage_info(), input_level,
+ output_level, compact_range_options, begin, end, compaction_end, conflict,
+ max_file_num_to_ignore);
+ if (result != nullptr) {
+ result->SetInputVersion(current_);
+ }
+ return result;
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
+ SuperVersion* sv = GetThreadLocalSuperVersion(db);
+ sv->Ref();
+ if (!ReturnThreadLocalSuperVersion(sv)) {
+ // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion()
+ // when the thread-local pointer was populated. So, the Ref() earlier in
+ // this function still prevents the returned SuperVersion* from being
+ // deleted out from under the caller.
+ sv->Unref();
+ }
+ return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
+ // The SuperVersion is cached in thread local storage to avoid acquiring
+ // mutex when SuperVersion does not change since the last use. When a new
+ // SuperVersion is installed, the compaction or flush thread cleans up
+ // cached SuperVersion in all existing thread local storage. To avoid
+ // acquiring mutex for this operation, we use atomic Swap() on the thread
+ // local pointer to guarantee exclusive access. If the thread local pointer
+ // is being used while a new SuperVersion is installed, the cached
+ // SuperVersion can become stale. In that case, the background thread would
+ // have swapped in kSVObsolete. We re-check the value when returning the
+ // SuperVersion back to thread local, with an atomic compare and swap.
+ // The SuperVersion will need to be released if it is detected to be stale.
+ void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+ // Invariant:
+ // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
+ // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
+ // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
+ // (if no Scrape happens).
+ assert(ptr != SuperVersion::kSVInUse);
+ SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+ if (sv == SuperVersion::kSVObsolete ||
+ sv->version_number != super_version_number_.load()) {
+ RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
+ SuperVersion* sv_to_delete = nullptr;
+
+ if (sv && sv->Unref()) {
+ RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
+ db->mutex()->Lock();
+ // NOTE: underlying resources held by superversion (sst files) might
+ // not be released until the next background job.
+ sv->Cleanup();
+ if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
+ db->AddSuperVersionsToFreeQueue(sv);
+ db->SchedulePurge();
+ } else {
+ sv_to_delete = sv;
+ }
+ } else {
+ db->mutex()->Lock();
+ }
+ sv = super_version_->Ref();
+ db->mutex()->Unlock();
+
+ delete sv_to_delete;
+ }
+ assert(sv != nullptr);
+ return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+ assert(sv != nullptr);
+ // Put the SuperVersion back
+ void* expected = SuperVersion::kSVInUse;
+ if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+ // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+ // storage has not been altered and no Scrape has happened. The
+ // SuperVersion is still current.
+ return true;
+ } else {
+ // ThreadLocal scrape happened in the process of this GetImpl call (after
+ // thread local Swap() at the beginning and before CompareAndSwap()).
+ // This means the SuperVersion it holds is obsolete.
+ assert(expected == SuperVersion::kSVObsolete);
+ }
+ return false;
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+ SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) {
+ db_mutex->AssertHeld();
+ return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_);
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+ SuperVersionContext* sv_context, InstrumentedMutex* db_mutex,
+ const MutableCFOptions& mutable_cf_options) {
+ SuperVersion* new_superversion = sv_context->new_superversion.release();
+ new_superversion->db_mutex = db_mutex;
+ new_superversion->mutable_cf_options = mutable_cf_options;
+ new_superversion->Init(this, mem_, imm_.current(), current_);
+ SuperVersion* old_superversion = super_version_;
+ super_version_ = new_superversion;
+ ++super_version_number_;
+ super_version_->version_number = super_version_number_;
+ super_version_->write_stall_condition =
+ RecalculateWriteStallConditions(mutable_cf_options);
+
+ if (old_superversion != nullptr) {
+ // Reset SuperVersions cached in thread local storage.
+ // This should be done before old_superversion->Unref(). That's to ensure
+ // that local_sv_ never holds the last reference to SuperVersion, since
+ // it has no means to safely do SuperVersion cleanup.
+ ResetThreadLocalSuperVersions();
+
+ if (old_superversion->mutable_cf_options.write_buffer_size !=
+ mutable_cf_options.write_buffer_size) {
+ mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+ }
+ if (old_superversion->write_stall_condition !=
+ new_superversion->write_stall_condition) {
+ sv_context->PushWriteStallNotification(
+ old_superversion->write_stall_condition,
+ new_superversion->write_stall_condition, GetName(), ioptions());
+ }
+ if (old_superversion->Unref()) {
+ old_superversion->Cleanup();
+ sv_context->superversions_to_free.push_back(old_superversion);
+ }
+ }
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+ autovector<void*> sv_ptrs;
+ local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+ for (auto ptr : sv_ptrs) {
+ assert(ptr);
+ if (ptr == SuperVersion::kSVInUse) {
+ continue;
+ }
+ auto sv = static_cast<SuperVersion*>(ptr);
+ bool was_last_ref __attribute__((__unused__));
+ was_last_ref = sv->Unref();
+ // sv couldn't have been the last reference because
+ // ResetThreadLocalSuperVersions() is called before
+ // unref'ing super_version_.
+ assert(!was_last_ref);
+ }
+}
+
+Status ColumnFamilyData::ValidateOptions(
+ const DBOptions& db_options, const ColumnFamilyOptions& cf_options) {
+ Status s;
+ s = CheckCompressionSupported(cf_options);
+ if (s.ok() && db_options.allow_concurrent_memtable_write) {
+ s = CheckConcurrentWritesSupported(cf_options);
+ }
+ if (s.ok() && db_options.unordered_write &&
+ cf_options.max_successive_merges != 0) {
+ s = Status::InvalidArgument(
+ "max_successive_merges > 0 is incompatible with unordered_write");
+ }
+ if (s.ok()) {
+ s = CheckCFPathsSupported(db_options, cf_options);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) {
+ if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) {
+ return Status::NotSupported(
+ "TTL is only supported in Block-Based Table format. ");
+ }
+ }
+
+ if (cf_options.periodic_compaction_seconds > 0 &&
+ cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
+ if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) {
+ return Status::NotSupported(
+ "Periodic Compaction is only supported in "
+ "Block-Based Table format. ");
+ }
+ }
+ return s;
+}
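+
+// For illustration only: one configuration the checks above would reject
+// (assuming a plain-table column family, i.e. not block-based):
+//
+//   ColumnFamilyOptions cf_opts;
+//   cf_opts.table_factory.reset(NewPlainTableFactory());
+//   cf_opts.ttl = 60 * 60 * 24;  // non-zero, non-default TTL
+//   // ValidateOptions(db_opts, cf_opts) -> Status::NotSupported, since TTL
+//   // is only supported with the block-based table format. Similarly,
+//   // unordered_write combined with max_successive_merges > 0 yields
+//   // Status::InvalidArgument.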
+
+#ifndef ROCKSDB_LITE
+Status ColumnFamilyData::SetOptions(
+ const DBOptions& db_options,
+ const std::unordered_map<std::string, std::string>& options_map) {
+ MutableCFOptions new_mutable_cf_options;
+ Status s =
+ GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
+ ioptions_.info_log, &new_mutable_cf_options);
+ if (s.ok()) {
+ ColumnFamilyOptions cf_options =
+ BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options);
+ s = ValidateOptions(db_options, cf_options);
+ }
+ if (s.ok()) {
+ mutable_cf_options_ = new_mutable_cf_options;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ }
+ return s;
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: DB mutex held
+Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
+ if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
+ return Env::WLTH_NOT_SET;
+ }
+ if (level == 0) {
+ return Env::WLTH_MEDIUM;
+ }
+ int base_level = current_->storage_info()->base_level();
+
+ // L1: medium, L2: long, ...
+ if (level - base_level >= 2) {
+ return Env::WLTH_EXTREME;
+ } else if (level < base_level) {
+    // There is no restriction that prevents the level passed in from being
+    // smaller than base_level.
+ return Env::WLTH_MEDIUM;
+ }
+ return static_cast<Env::WriteLifeTimeHint>(level - base_level +
+ static_cast<int>(Env::WLTH_MEDIUM));
+}
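+
+// Worked example of the mapping above (level compaction, base_level of 1,
+// and assuming the usual enum ordering WLTH_MEDIUM < WLTH_LONG <
+// WLTH_EXTREME): L0 -> WLTH_MEDIUM, L1 -> WLTH_MEDIUM, L2 -> WLTH_LONG, and
+// L3 or deeper -> WLTH_EXTREME, since level - base_level >= 2 there.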
+
+Status ColumnFamilyData::AddDirectories(
+ std::map<std::string, std::shared_ptr<Directory>>* created_dirs) {
+ Status s;
+ assert(created_dirs != nullptr);
+ assert(data_dirs_.empty());
+ for (auto& p : ioptions_.cf_paths) {
+ auto existing_dir = created_dirs->find(p.path);
+
+ if (existing_dir == created_dirs->end()) {
+ std::unique_ptr<Directory> path_directory;
+ s = DBImpl::CreateAndNewDirectory(ioptions_.env, p.path, &path_directory);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(path_directory != nullptr);
+ data_dirs_.emplace_back(path_directory.release());
+ (*created_dirs)[p.path] = data_dirs_.back();
+ } else {
+ data_dirs_.emplace_back(existing_dir->second);
+ }
+ }
+ assert(data_dirs_.size() == ioptions_.cf_paths.size());
+ return s;
+}
+
+Directory* ColumnFamilyData::GetDataDir(size_t path_id) const {
+ if (data_dirs_.empty()) {
+ return nullptr;
+ }
+
+ assert(path_id < data_dirs_.size());
+ return data_dirs_[path_id].get();
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+ const ImmutableDBOptions* db_options,
+ const FileOptions& file_options,
+ Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer)
+ : max_column_family_(0),
+ dummy_cfd_(new ColumnFamilyData(
+ 0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options,
+ file_options, nullptr, block_cache_tracer)),
+ default_cfd_cache_(nullptr),
+ db_name_(dbname),
+ db_options_(db_options),
+ file_options_(file_options),
+ table_cache_(table_cache),
+ write_buffer_manager_(write_buffer_manager),
+ write_controller_(write_controller),
+ block_cache_tracer_(block_cache_tracer) {
+ // initialize linked list
+ dummy_cfd_->prev_ = dummy_cfd_;
+ dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+ while (column_family_data_.size() > 0) {
+ // cfd destructor will delete itself from column_family_data_
+ auto cfd = column_family_data_.begin()->second;
+ bool last_ref __attribute__((__unused__));
+ last_ref = cfd->UnrefAndTryDelete();
+ assert(last_ref);
+ }
+ bool dummy_last_ref __attribute__((__unused__));
+ dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
+ assert(dummy_last_ref);
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+ assert(default_cfd_cache_ != nullptr);
+ return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+ auto cfd_iter = column_family_data_.find(id);
+ if (cfd_iter != column_family_data_.end()) {
+ return cfd_iter->second;
+ } else {
+ return nullptr;
+ }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
+ const {
+ auto cfd_iter = column_families_.find(name);
+ if (cfd_iter != column_families_.end()) {
+ auto cfd = GetColumnFamily(cfd_iter->second);
+ assert(cfd != nullptr);
+ return cfd;
+ } else {
+ return nullptr;
+ }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+ return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+ max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+ return column_families_.size();
+}
+
+// under a DB mutex AND write thread
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+ const std::string& name, uint32_t id, Version* dummy_versions,
+ const ColumnFamilyOptions& options) {
+ assert(column_families_.find(name) == column_families_.end());
+ ColumnFamilyData* new_cfd = new ColumnFamilyData(
+ id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
+ *db_options_, file_options_, this, block_cache_tracer_);
+ column_families_.insert({name, id});
+ column_family_data_.insert({id, new_cfd});
+ max_column_family_ = std::max(max_column_family_, id);
+ // add to linked list
+ new_cfd->next_ = dummy_cfd_;
+ auto prev = dummy_cfd_->prev_;
+ new_cfd->prev_ = prev;
+ prev->next_ = new_cfd;
+ dummy_cfd_->prev_ = new_cfd;
+ if (id == 0) {
+ default_cfd_cache_ = new_cfd;
+ }
+ return new_cfd;
+}
+
+// REQUIRES: DB mutex held
+void ColumnFamilySet::FreeDeadColumnFamilies() {
+ autovector<ColumnFamilyData*> to_delete;
+ for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
+ if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
+ to_delete.push_back(cfd);
+ }
+ }
+ for (auto cfd : to_delete) {
+ // this is very rare, so it's not a problem that we do it under a mutex
+ delete cfd;
+ }
+}
+
+// under a DB mutex AND from a write thread
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+ auto cfd_iter = column_family_data_.find(cfd->GetID());
+ assert(cfd_iter != column_family_data_.end());
+ column_family_data_.erase(cfd_iter);
+ column_families_.erase(cfd->GetName());
+}
+
+// under a DB mutex OR from a write thread
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+ if (column_family_id == 0) {
+ // optimization for common case
+ current_ = column_family_set_->GetDefault();
+ } else {
+ current_ = column_family_set_->GetColumnFamily(column_family_id);
+ }
+ handle_.SetCFD(current_);
+ return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+ assert(current_ != nullptr);
+ return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+ assert(current_ != nullptr);
+ return current_->mem();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+ assert(current_ != nullptr);
+ return &handle_;
+}
+
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+ uint32_t column_family_id = 0;
+ if (column_family != nullptr) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ column_family_id = cfh->GetID();
+ }
+ return column_family_id;
+}
+
+const Comparator* GetColumnFamilyUserComparator(
+ ColumnFamilyHandle* column_family) {
+ if (column_family != nullptr) {
+ return column_family->GetComparator();
+ }
+ return nullptr;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h
new file mode 100644
index 000000000..fcc8ea2cf
--- /dev/null
+++ b/src/rocksdb/db/column_family.h
@@ -0,0 +1,757 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <atomic>
+
+#include "db/memtable_list.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/write_batch_internal.h"
+#include "db/write_controller.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class VersionSet;
+class VersionStorageInfo;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
+struct SuperVersionContext;
+
+extern const double kIncSlowdownRatio;
+// This file contains a list of data structures for managing column family
+// level metadata.
+//
+// The basic relationships among classes declared here are illustrated as
+// following:
+//
+// +----------------------+ +----------------------+ +--------+
+// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
+// | +----------------------+ | +----------------------+ +----+---+
+// | +--------------------------+ |
+// | | +-----------------------------+
+// | | |
+// | | +-----------------------------v-------------------------------+
+// | | | |
+// | | | ColumnFamilySet |
+// | | | |
+// | | +-------------+--------------------------+----------------+---+
+// | | | | |
+// | +-------------------------------------+ | |
+// | | | | v
+// | +-------------v-------------+ +-----v----v---------+
+// | | | | |
+// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
+// | | | | |
+// +---> | | |
+// | +---------+ | |
+// | | MemTable| | |
+// | | List | | |
+// +--------+---+--+-+----+----+ +--------------------++
+// | | | |
+// | | | |
+// | | | +-----------------------+
+// | | +-----------+ |
+// v +--------+ | |
+// +--------+--------+ | | |
+// | | | | +----------v----------+
+// +---> |SuperVersion 1.a +-----------------> |
+// | +------+ | | MemTableListVersion |
+// +---+-------------+ | | | | |
+// | | | | +----+------------+---+
+// | current | | | | |
+// | +-------------+ | |mem | |
+// | | | | | |
+// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
+// | | | | | | | |
+// | Version 1.a | | memtable | | memtable | | memtable |
+// | | | 1.a | | 1.b | | 1.c |
+// +-------------+ | | | | | |
+// +----------+ +----------+ +----------+
+//
+// DBImpl keeps a ColumnFamilySet, which references all column families by
+// pointing to the respective ColumnFamilyData object of each column family.
+// This is how DBImpl can list and operate on all the column families.
+// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
+// when a user executes a query, it can directly find the memtables, Version
+// and SuperVersion of the column family, without going through
+// ColumnFamilySet.
+//
+// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
+// and SST files) indirectly, while ongoing operations may hold references
+// to a current or an out-of-date SuperVersion, which in turn points to a
+// point-in-time view of the LSM-tree. This guarantees the memtables and SST
+// files being operated on will not go away, until the SuperVersion's
+// reference count drops to 0 and it is destroyed.
+//
+// The following graph illustrates a possible referencing relationships:
+//
+// Column +--------------+ current +-----------+
+// Family +---->+ +------------------->+ |
+// Data | SuperVersion +----------+ | Version A |
+// | 3 | imm | | |
+// Iter2 +----->+ | +-------v------+ +-----------+
+// +-----+--------+ | MemtableList +----------------> Empty
+// | | Version r | +-----------+
+// | +--------------+ | |
+// +------------------+ current| Version B |
+// +--------------+ | +----->+ |
+// | | | | +-----+-----+
+// Compaction +>+ SuperVersion +-------------+ ^
+// Job | 2 +------+ | |current
+// | +----+ | | mem | +------------+
+// +--------------+ | | +---------------------> |
+// | +------------------------> MemTable a |
+// | mem | | |
+// +--------------+ | | +------------+
+// | +--------------------------+
+// Iter1 +-----> SuperVersion | | +------------+
+// | 1 +------------------------------>+ |
+// | +-+ | mem | MemTable b |
+// +--------------+ | | | |
+// | | +--------------+ +-----^------+
+// | |imm | MemtableList | |
+// | +--->+ Version s +------------+
+// | +--------------+
+// | +--------------+
+// | | MemtableList |
+// +------>+ Version t +--------> Empty
+// imm +--------------+
+//
+// In this example, even if the current LSM-tree consists of Version A and
+// memtable a, which is also referenced by SuperVersion3, two older
+// SuperVersions, SuperVersion2 and SuperVersion1, still exist and are
+// referenced by a
+// compaction job and an old iterator Iter1, respectively. SuperVersion2
+// contains Version B, memtable a and memtable b; SuperVersion1 contains
+// Version B and memtable b (mutable). As a result, Version B and memtable b
+// are prevented from being destroyed or deleted.
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when
+// the client is done using the column family.
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+ // create while holding the mutex
+ ColumnFamilyHandleImpl(
+ ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
+ // destroy without mutex
+ virtual ~ColumnFamilyHandleImpl();
+ virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+ virtual uint32_t GetID() const override;
+ virtual const std::string& GetName() const override;
+ virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
+ virtual const Comparator* GetComparator() const override;
+
+ private:
+ ColumnFamilyData* cfd_;
+ DBImpl* db_;
+ InstrumentedMutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to
+// a ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods.
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+ ColumnFamilyHandleInternal()
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {}
+
+ void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
+ virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+ ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+ // Accessing members of this class is not thread-safe and requires external
+  // synchronization (i.e. DB mutex held or on the write thread).
+ ColumnFamilyData* cfd;
+ MemTable* mem;
+ MemTableListVersion* imm;
+ Version* current;
+ MutableCFOptions mutable_cf_options;
+ // Version number of the current SuperVersion
+ uint64_t version_number;
+ WriteStallCondition write_stall_condition;
+
+ InstrumentedMutex* db_mutex;
+
+ // should be called outside the mutex
+ SuperVersion() = default;
+ ~SuperVersion();
+ SuperVersion* Ref();
+ // If Unref() returns true, Cleanup() should be called with mutex held
+ // before deleting this SuperVersion.
+ bool Unref();
+
+ // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that need to be deleted in the to_delete vector. Unref'ing those
+  // objects needs to be done while holding the mutex.
+ void Cleanup();
+ void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+ MemTableListVersion* new_imm, Version* new_current);
+
+ // The value of dummy is not actually used. kSVInUse takes its address as a
+ // mark in the thread local storage to indicate the SuperVersion is in use
+  // by a thread. This way, the value of kSVInUse is guaranteed not to
+  // conflict with any SuperVersion object address and is portable across
+  // platforms.
+ static int dummy;
+ static void* const kSVInUse;
+ static void* const kSVObsolete;
+
+ private:
+ std::atomic<uint32_t> refs;
+ // We need to_delete because during Cleanup(), imm->Unref() returns
+ // all memtables that we need to free through this vector. We then
+ // delete all those memtables outside of mutex, during destruction
+ autovector<MemTable*> to_delete;
+};
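+
+// A minimal reader-side usage sketch (illustrative only; helper names on
+// DBImpl such as ReturnAndCleanupSuperVersion() are assumptions here, not
+// part of this header):
+//
+//   SuperVersion* sv = cfd->GetReferencedSuperVersion(db);  // already Ref()'d
+//   // ... read through sv->mem, sv->imm and sv->current ...
+//   db->ReturnAndCleanupSuperVersion(cfd, sv);
+//
+// If Unref() were called directly and returned true, Cleanup() would have to
+// run with the DB mutex held before deleting the SuperVersion, and the
+// memtables collected in to_delete would be freed outside of the mutex.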
+
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
+extern Status CheckConcurrentWritesSupported(
+ const ColumnFamilyOptions& cf_options);
+
+extern Status CheckCFPathsSupported(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+
+extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+ const ColumnFamilyOptions& src);
+// Wrap user-defined table properties collector factories from `cf_options`
+// into internal ones in int_tbl_prop_collector_factories. Add a system internal
+// one too.
+extern void GetIntTblPropCollectorFactory(
+ const ImmutableCFOptions& ioptions,
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+ ~ColumnFamilyData();
+
+ // thread-safe
+ uint32_t GetID() const { return id_; }
+ // thread-safe
+ const std::string& GetName() const { return name_; }
+
+ // Ref() can only be called from a context where the caller can guarantee
+ // that ColumnFamilyData is alive (while holding a non-zero ref already,
+ // holding a DB mutex, or as the leader in a write batch group).
+ void Ref() { refs_.fetch_add(1); }
+
+ // Unref decreases the reference count, but does not handle deletion
+ // when the count goes to 0. If this method returns true then the
+ // caller should delete the instance immediately, or later, by calling
+ // FreeDeadColumnFamilies(). Unref() can only be called while holding
+ // a DB mutex, or during single-threaded recovery.
+ bool Unref() {
+ int old_refs = refs_.fetch_sub(1);
+ assert(old_refs > 0);
+ return old_refs == 1;
+ }
+
+  // UnrefAndTryDelete() decreases the reference count and frees the object if
+  // needed. It returns true if the object was freed, false otherwise.
+  // UnrefAndTryDelete() can only be called while holding a DB mutex, or
+  // during single-threaded recovery.
+ bool UnrefAndTryDelete();
+
+  // SetDropped() can only be called under the following conditions:
+ // 1) Holding a DB mutex,
+ // 2) from single-threaded write thread, AND
+ // 3) from single-threaded VersionSet::LogAndApply()
+ // After dropping column family no other operation on that column family
+ // will be executed. All the files and memory will be, however, kept around
+ // until client drops the column family handle. That way, client can still
+ // access data from dropped column family.
+  // A column family can be dropped and still be alive. In that state:
+  // *) Compaction and flush are not executed on the dropped column family.
+  // *) Clients can continue reading from the column family. Writes will fail
+  //    unless WriteOptions::ignore_missing_column_families is true.
+ // When the dropped column family is unreferenced, then we:
+ // *) Remove column family from the linked list maintained by ColumnFamilySet
+ // *) delete all memory associated with that column family
+ // *) delete all the files associated with that column family
+ void SetDropped();
+ bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
+
+ // thread-safe
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+ void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ void SetFlushReason(FlushReason flush_reason) {
+ flush_reason_ = flush_reason;
+ }
+ FlushReason GetFlushReason() const { return flush_reason_; }
+ // thread-safe
+ const FileOptions* soptions() const;
+ const ImmutableCFOptions* ioptions() const { return &ioptions_; }
+ // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by the current SuperVersion.
+  // You should use this API to reference MutableCFOptions most of the time.
+ const MutableCFOptions* GetCurrentMutableCFOptions() const {
+ return &(super_version_->mutable_cf_options);
+ }
+ // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may not be in effect yet.
+ const MutableCFOptions* GetLatestMutableCFOptions() const {
+ return &mutable_cf_options_;
+ }
+
+ // REQUIRES: DB mutex held
+  // Build ColumnFamilyOptions with immutable options and the latest mutable
+ // options.
+ ColumnFamilyOptions GetLatestCFOptions() const;
+
+ bool is_delete_range_supported() { return is_delete_range_supported_; }
+
+ // Validate CF options against DB options
+ static Status ValidateOptions(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+#ifndef ROCKSDB_LITE
+ // REQUIRES: DB mutex held
+ Status SetOptions(
+ const DBOptions& db_options,
+ const std::unordered_map<std::string, std::string>& options_map);
+#endif // ROCKSDB_LITE
+
+ InternalStats* internal_stats() { return internal_stats_.get(); }
+
+ MemTableList* imm() { return &imm_; }
+ MemTable* mem() { return mem_; }
+ Version* current() { return current_; }
+ Version* dummy_versions() { return dummy_versions_; }
+ void SetCurrent(Version* _current);
+  uint64_t GetNumLiveVersions() const;    // REQUIRES: DB mutex held
+  uint64_t GetTotalSstFilesSize() const;  // REQUIRES: DB mutex held
+  uint64_t GetLiveSstFilesSize() const;   // REQUIRES: DB mutex held
+ void SetMemtable(MemTable* new_mem) {
+ uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
+ new_mem->SetID(memtable_id);
+ mem_ = new_mem;
+ }
+
+ // calculate the oldest log needed for the durability of this column family
+ uint64_t OldestLogToKeep();
+
+ // See Memtable constructor for explanation of earliest_seq param.
+ MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+ SequenceNumber earliest_seq);
+ void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+ SequenceNumber earliest_seq);
+
+ TableCache* table_cache() const { return table_cache_.get(); }
+
+ // See documentation in compaction_picker.h
+ // REQUIRES: DB mutex held
+ bool NeedsCompaction() const;
+ // REQUIRES: DB mutex held
+ Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+ LogBuffer* log_buffer);
+
+ // Check if the passed range overlap with any running compactions.
+ // REQUIRES: DB mutex held
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Check if the passed ranges overlap with any unflushed memtables
+ // (immutable or mutable).
+ //
+ // @param super_version A referenced SuperVersion that will be held for the
+ // duration of this function.
+ //
+ // Thread-safe
+ Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
+ SuperVersion* super_version, bool* overlap);
+
+  // A flag to indicate that a manual compaction should compact all levels
+  // together instead of a specific level.
+  static const int kCompactAllLevels;
+  // A flag to indicate that a manual compaction's output is the base level.
+ static const int kCompactToBaseLevel;
+ // REQUIRES: DB mutex held
+ Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore);
+
+ CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+ // thread-safe
+ const Comparator* user_comparator() const {
+ return internal_comparator_.user_comparator();
+ }
+ // thread-safe
+ const InternalKeyComparator& internal_comparator() const {
+ return internal_comparator_;
+ }
+
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories() const {
+ return &int_tbl_prop_collector_factories_;
+ }
+
+ SuperVersion* GetSuperVersion() { return super_version_; }
+ // thread-safe
+  // Return an already-referenced SuperVersion to be used safely.
+ SuperVersion* GetReferencedSuperVersion(DBImpl* db);
+ // thread-safe
+ // Get SuperVersion stored in thread local storage. If it does not exist,
+ // get a reference from a current SuperVersion.
+ SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
+  // Try to return SuperVersion back to thread local storage. Return true on
+ // success and false on failure. It fails when the thread local storage
+ // contains anything other than SuperVersion::kSVInUse flag.
+ bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+ // thread-safe
+ uint64_t GetSuperVersionNumber() const {
+ return super_version_number_.load();
+ }
+  // Installs the SuperVersion pre-allocated in *sv_context. If the previous
+  // SuperVersion's reference count drops to zero, it is cleaned up and pushed
+  // to sv_context->superversions_to_free so it can be deleted outside of the
+  // mutex. Taking the new SuperVersion through sv_context lets clients
+  // allocate it outside of the mutex.
+ // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+ void InstallSuperVersion(SuperVersionContext* sv_context,
+ InstrumentedMutex* db_mutex,
+ const MutableCFOptions& mutable_cf_options);
+ void InstallSuperVersion(SuperVersionContext* sv_context,
+ InstrumentedMutex* db_mutex);
+
+ void ResetThreadLocalSuperVersions();
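+
+  // A minimal caller-side sketch of the install flow (illustrative only; the
+  // SuperVersionContext constructor argument and Clean() shown here are
+  // assumptions, not declared in this header):
+  //
+  //   SuperVersionContext sv_context(/*create_superversion=*/true);
+  //   db_mutex->Lock();
+  //   cfd->InstallSuperVersion(&sv_context, db_mutex);
+  //   db_mutex->Unlock();
+  //   sv_context.Clean();  // frees obsolete SuperVersions outside the mutex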
+
+ // Protected by DB mutex
+ void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
+ void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
+ bool queued_for_flush() { return queued_for_flush_; }
+ bool queued_for_compaction() { return queued_for_compaction_; }
+
+ enum class WriteStallCause {
+ kNone,
+ kMemtableLimit,
+ kL0FileCountLimit,
+ kPendingCompactionBytes,
+ };
+ static std::pair<WriteStallCondition, WriteStallCause>
+ GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files,
+ uint64_t num_compaction_needed_bytes,
+ const MutableCFOptions& mutable_cf_options);
+
+  // Recalculate some write stall conditions, which change only during
+  // compaction, when adding a new memtable, and/or when the compaction score
+  // is recalculated. These values are used by DBImpl::MakeRoomForWrite to
+  // decide whether it needs to stall writes.
+ WriteStallCondition RecalculateWriteStallConditions(
+ const MutableCFOptions& mutable_cf_options);
+
+ void set_initialized() { initialized_.store(true); }
+
+ bool initialized() const { return initialized_.load(); }
+
+ const ColumnFamilyOptions& initial_cf_options() {
+ return initial_cf_options_;
+ }
+
+ Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
+
+  // created_dirs remembers the directories already created, so that we don't
+  // need to repeat the same directory-creation operation.
+ Status AddDirectories(
+ std::map<std::string, std::shared_ptr<Directory>>* created_dirs);
+
+ Directory* GetDataDir(size_t path_id) const;
+
+ ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+
+ private:
+ friend class ColumnFamilySet;
+ ColumnFamilyData(uint32_t id, const std::string& name,
+ Version* dummy_versions, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ const ColumnFamilyOptions& options,
+ const ImmutableDBOptions& db_options,
+ const FileOptions& file_options,
+ ColumnFamilySet* column_family_set,
+ BlockCacheTracer* const block_cache_tracer);
+
+ uint32_t id_;
+ const std::string name_;
+ Version* dummy_versions_; // Head of circular doubly-linked list of versions.
+ Version* current_; // == dummy_versions->prev_
+
+ std::atomic<int> refs_; // outstanding references to ColumnFamilyData
+ std::atomic<bool> initialized_;
+ std::atomic<bool> dropped_; // true if client dropped it
+
+ const InternalKeyComparator internal_comparator_;
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories_;
+
+ const ColumnFamilyOptions initial_cf_options_;
+ const ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+
+ const bool is_delete_range_supported_;
+
+ std::unique_ptr<TableCache> table_cache_;
+
+ std::unique_ptr<InternalStats> internal_stats_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ MemTable* mem_;
+ MemTableList imm_;
+ SuperVersion* super_version_;
+
+ // An ordinal representing the current SuperVersion. Updated by
+ // InstallSuperVersion(), i.e. incremented every time super_version_
+ // changes.
+ std::atomic<uint64_t> super_version_number_;
+
+ // Thread's local copy of SuperVersion pointer
+ // This needs to be destructed before mutex_
+ std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+ // pointers for a circular linked list. we use it to support iterations over
+ // all column families that are alive (note: dropped column families can also
+ // be alive as long as client holds a reference)
+ ColumnFamilyData* next_;
+ ColumnFamilyData* prev_;
+
+ // This is the earliest log file number that contains data from this
+ // Column Family. All earlier log files must be ignored and not
+ // recovered from
+ uint64_t log_number_;
+
+ std::atomic<FlushReason> flush_reason_;
+
+ // An object that keeps all the compaction stats
+ // and picks the next compaction
+ std::unique_ptr<CompactionPicker> compaction_picker_;
+
+ ColumnFamilySet* column_family_set_;
+
+ std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+ // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+ bool queued_for_flush_;
+
+ // If true --> this ColumnFamily is currently present in
+ // DBImpl::compaction_queue_
+ bool queued_for_compaction_;
+
+ uint64_t prev_compaction_needed_bytes_;
+
+ // if the database was opened with 2pc enabled
+ bool allow_2pc_;
+
+ // Memtable id to track flush.
+ std::atomic<uint64_t> last_memtable_id_;
+
+ // Directories corresponding to cf_paths.
+ std::vector<std::shared_ptr<Directory>> data_dirs_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+// mutex AND executed in the write thread.
+// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
+// single-threaded write thread. It is also called during Recovery and in
+// DumpManifest().
+// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
+// held and it needs to be executed from the write thread. SetDropped() also
+// guarantees that it will be called only from single-threaded LogAndApply(),
+// but this condition is not that important.
+// * Iteration -- hold DB mutex. You may release it in the body of the
+// iteration, but if you do, reference the column family before releasing the
+// mutex and unreference it after you reacquire the mutex, since the column
+// family might get dropped while the DB mutex is released
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
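+//
+// A minimal sketch of the "reference before releasing the mutex" rule above
+// (illustrative only):
+//
+//   // DB mutex held; cfd obtained from the set or from iteration
+//   cfd->Ref();                // pin it before releasing the mutex
+//   mutex->Unlock();
+//   // ... work on cfd without the DB mutex ...
+//   mutex->Lock();
+//   cfd->UnrefAndTryDelete();  // may free cfd if it was dropped meanwhile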
+class ColumnFamilySet {
+ public:
+ // ColumnFamilySet supports iteration
+ class iterator {
+ public:
+ explicit iterator(ColumnFamilyData* cfd)
+ : current_(cfd) {}
+ iterator& operator++() {
+ // dropped column families might still be included in this iteration
+ // (we're only removing them when client drops the last reference to the
+ // column family).
+ // dummy is never dead, so this will never be infinite
+ do {
+ current_ = current_->next_;
+ } while (current_->refs_.load(std::memory_order_relaxed) == 0);
+ return *this;
+ }
+ bool operator!=(const iterator& other) {
+ return this->current_ != other.current_;
+ }
+ ColumnFamilyData* operator*() { return current_; }
+
+ private:
+ ColumnFamilyData* current_;
+ };
+
+ ColumnFamilySet(const std::string& dbname,
+ const ImmutableDBOptions* db_options,
+ const FileOptions& file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer);
+ ~ColumnFamilySet();
+
+ ColumnFamilyData* GetDefault() const;
+ // GetColumnFamily() calls return nullptr if column family is not found
+ ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+ ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+ // this call will return the next available column family ID. it guarantees
+ // that there is no column family with id greater than or equal to the
+ // returned value in the current running instance or anytime in RocksDB
+ // instance history.
+ uint32_t GetNextColumnFamilyID();
+ uint32_t GetMaxColumnFamily();
+ void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+ size_t NumberOfColumnFamilies() const;
+
+ ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+ Version* dummy_version,
+ const ColumnFamilyOptions& options);
+
+ iterator begin() { return iterator(dummy_cfd_->next_); }
+ iterator end() { return iterator(dummy_cfd_); }
+
+ // REQUIRES: DB mutex held
+ // Don't call while iterating over ColumnFamilySet
+ void FreeDeadColumnFamilies();
+
+ Cache* get_table_cache() { return table_cache_; }
+
+ private:
+ friend class ColumnFamilyData;
+ // helper function that gets called from cfd destructor
+ // REQUIRES: DB mutex held
+ void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+ // column_families_ and column_family_data_ need to be protected:
+ // * when mutating both conditions have to be satisfied:
+ // 1. DB mutex locked
+ // 2. thread currently in single-threaded write thread
+ // * when reading, at least one condition needs to be satisfied:
+ // 1. DB mutex locked
+ // 2. accessed from a single-threaded write thread
+ std::unordered_map<std::string, uint32_t> column_families_;
+ std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
+
+ uint32_t max_column_family_;
+ ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since the default column family always
+  // exists. We are also not responsible for cleaning up default_cfd_cache_.
+  // This is just a cache that makes the common case (accessing the default
+  // column family) faster.
+ ColumnFamilyData* default_cfd_cache_;
+
+ const std::string db_name_;
+ const ImmutableDBOptions* const db_options_;
+ const FileOptions file_options_;
+ Cache* table_cache_;
+ WriteBufferManager* write_buffer_manager_;
+ WriteController* write_controller_;
+ BlockCacheTracer* const block_cache_tracer_;
+};
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+ : column_family_set_(column_family_set), current_(nullptr) {}
+
+ // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
+ // with the arguments used to construct *orig.
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
+ : column_family_set_(orig->column_family_set_), current_(nullptr) {}
+
+ // sets current_ to ColumnFamilyData with column_family_id
+ // returns false if column family doesn't exist
+  // REQUIRES: calls to this function on DBImpl::column_family_memtables_
+  // must be made under a DB mutex OR from a write thread
+ bool Seek(uint32_t column_family_id) override;
+
+ // Returns log number of the selected column family
+ // REQUIRES: under a DB mutex OR from a write thread
+ uint64_t GetLogNumber() const override;
+
+ // REQUIRES: Seek() called first
+  // REQUIRES: calls to this function on DBImpl::column_family_memtables_
+  // must be made under a DB mutex OR from a write thread
+ virtual MemTable* GetMemTable() const override;
+
+ // Returns column family handle for the selected column family
+  // REQUIRES: calls to this function on DBImpl::column_family_memtables_
+  // must be made under a DB mutex OR from a write thread
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+ // Cannot be called while another thread is calling Seek().
+  // REQUIRES: calls to this function on DBImpl::column_family_memtables_
+  // must be made under a DB mutex OR from a write thread
+ virtual ColumnFamilyData* current() override { return current_; }
+
+ private:
+ ColumnFamilySet* column_family_set_;
+ ColumnFamilyData* current_;
+ ColumnFamilyHandleInternal handle_;
+};
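+
+// A minimal usage sketch (illustrative only): roughly how a write batch
+// insertion routes a record to the right memtable by column family ID.
+//
+//   ColumnFamilyMemTablesImpl cf_mems(column_family_set);
+//   if (cf_mems.Seek(column_family_id)) {
+//     MemTable* mem = cf_mems.GetMemTable();
+//     // ... insert the record into mem; cf_mems.GetLogNumber() tells which
+//     // log file the column family requires for recovery ...
+//   } else {
+//     // unknown column family ID in the batch
+//   }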
+
+extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
+
+extern const Comparator* GetColumnFamilyUserComparator(
+ ColumnFamilyHandle* column_family);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc
new file mode 100644
index 000000000..24ff4e08b
--- /dev/null
+++ b/src/rocksdb/db/column_family_test.cc
@@ -0,0 +1,3387 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "memtable/hash_skiplist_rep.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+
+namespace {
+std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+} // anonymous namespace
+
+// counts how many operations were performed
+class EnvCounter : public EnvWrapper {
+ public:
+ explicit EnvCounter(Env* base)
+ : EnvWrapper(base), num_new_writable_file_(0) {}
+ int GetNumberOfNewWritableFileCalls() {
+ return num_new_writable_file_;
+ }
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ ++num_new_writable_file_;
+ return EnvWrapper::NewWritableFile(f, r, soptions);
+ }
+
+ private:
+ std::atomic<int> num_new_writable_file_;
+};
+
+class ColumnFamilyTestBase : public testing::Test {
+ public:
+ explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) {
+ Env* base_env = Env::Default();
+#ifndef ROCKSDB_LITE
+ const char* test_env_uri = getenv("TEST_ENV_URI");
+ if (test_env_uri) {
+ Env* test_env = nullptr;
+ Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_);
+ base_env = test_env;
+ EXPECT_OK(s);
+ EXPECT_NE(Env::Default(), base_env);
+ }
+#endif // !ROCKSDB_LITE
+ EXPECT_NE(nullptr, base_env);
+ env_ = new EnvCounter(base_env);
+ dbname_ = test::PerThreadDBPath("column_family_test");
+ db_options_.create_if_missing = true;
+ db_options_.fail_if_options_file_error = true;
+ db_options_.env = env_;
+ DestroyDB(dbname_, Options(db_options_, column_family_options_));
+ }
+
+ ~ColumnFamilyTestBase() override {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (auto h : handles_) {
+ ColumnFamilyDescriptor cfdescriptor;
+ h->GetDescriptor(&cfdescriptor);
+ column_families.push_back(cfdescriptor);
+ }
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(column_families);
+ delete env_;
+ }
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions options;
+ options.format_version = format_;
+ return options;
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ if (k == 0) {
+ // Ugh. Random seed of 0 used to produce no entropy. This code
+ // preserves the implementation that was in place when all of the
+ // magic values in this file were picked.
+ *storage = std::string(kValueSize, ' ');
+ return Slice(*storage);
+ } else {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+ }
+
+ void Build(int base, int n, int flush_every = 0) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+
+ for (int i = 0; i < n; i++) {
+ if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ }
+
+ int keyi = base + i;
+ Slice key(DBTestBase::Key(keyi));
+
+ batch.Clear();
+ batch.Put(handles_[0], key, Value(keyi, &value_space));
+ batch.Put(handles_[1], key, Value(keyi, &value_space));
+ batch.Put(handles_[2], key, Value(keyi, &value_space));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+
+ void CheckMissed() {
+ uint64_t next_expected = 0;
+ uint64_t missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ for (int cf = 0; cf < 3; cf++) {
+ next_expected = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true), handles_[cf]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ in.remove_prefix(3);
+ if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+ }
+
+ ASSERT_EQ(0, bad_keys);
+ ASSERT_EQ(0, bad_values);
+ ASSERT_EQ(0, missed);
+ (void)correct;
+ }
+
+ void Close() {
+ for (auto h : handles_) {
+ if (h) {
+ db_->DestroyColumnFamilyHandle(h);
+ }
+ }
+ handles_.clear();
+ names_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status TryOpen(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ names_.clear();
+ for (size_t i = 0; i < cf.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ cf[i], options.size() == 0 ? column_family_options_ : options[i]));
+ names_.push_back(cf[i]);
+ }
+ return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status OpenReadOnly(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ names_.clear();
+ for (size_t i = 0; i < cf.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ cf[i], options.size() == 0 ? column_family_options_ : options[i]));
+ names_.push_back(cf[i]);
+ }
+ return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
+ &db_);
+ }
+
+#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
+ void AssertOpenReadOnly(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ ASSERT_OK(OpenReadOnly(cf, options));
+ }
+#endif // !ROCKSDB_LITE
+
+
+ void Open(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ ASSERT_OK(TryOpen(cf, options));
+ }
+
+ void Open() {
+ Open({"default"});
+ }
+
+ DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
+
+ int GetProperty(int cf, std::string property) {
+ std::string value;
+ EXPECT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+#ifndef CYGWIN
+ return std::stoi(value);
+#else
+ return std::strtol(value.c_str(), 0 /* off */, 10 /* base */);
+#endif
+ }
+
+ bool IsDbWriteStopped() {
+#ifndef ROCKSDB_LITE
+ uint64_t v;
+ EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v));
+ return (v == 1);
+#else
+ return dbfull()->TEST_write_controler().IsStopped();
+#endif // !ROCKSDB_LITE
+ }
+
+ uint64_t GetDbDelayedWriteRate() {
+#ifndef ROCKSDB_LITE
+ uint64_t v;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v));
+ return v;
+#else
+ if (!dbfull()->TEST_write_controler().NeedsDelay()) {
+ return 0;
+ }
+ return dbfull()->TEST_write_controler().delayed_write_rate();
+#endif // !ROCKSDB_LITE
+ }
+
+ void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families =
+ std::vector<ColumnFamilyDescriptor>()) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_),
+ column_families));
+ }
+
+ void CreateColumnFamilies(
+ const std::vector<std::string>& cfs,
+ const std::vector<ColumnFamilyOptions> options = {}) {
+ int cfi = static_cast<int>(handles_.size());
+ handles_.resize(cfi + cfs.size());
+ names_.resize(cfi + cfs.size());
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ const auto& current_cf_opt =
+ options.size() == 0 ? column_family_options_ : options[i];
+ ASSERT_OK(
+ db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi]));
+ names_[cfi] = cfs[i];
+
+#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor
+ // Verify the CF options of the returned CF handle.
+ ColumnFamilyDescriptor desc;
+ ASSERT_OK(handles_[cfi]->GetDescriptor(&desc));
+ RocksDBOptionsParser::VerifyCFOptions(desc.options, current_cf_opt);
+#endif // !ROCKSDB_LITE
+ cfi++;
+ }
+ }
+
+ void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+ Close();
+ assert(options.size() == 0 || names.size() == options.size());
+ Open(names, options);
+ }
+
+ void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
+ CreateColumnFamilies(cfs);
+ Reopen();
+ }
+
+ void DropColumnFamilies(const std::vector<int>& cfs) {
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
+ db_->DestroyColumnFamilyHandle(handles_[cf]);
+ handles_[cf] = nullptr;
+ names_[cf] = "";
+ }
+ }
+
+ void PutRandomData(int cf, int num, int key_value_size, bool save = false) {
+ if (cf >= static_cast<int>(keys_.size())) {
+ keys_.resize(cf + 1);
+ }
+ for (int i = 0; i < num; ++i) {
+ // 10 bytes for key, rest is value
+ if (!save) {
+ ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11),
+ RandomString(&rnd_, key_value_size - 10)));
+ } else {
+ std::string key = test::RandomKey(&rnd_, 11);
+ keys_[cf].insert(key);
+ ASSERT_OK(Put(cf, key, RandomString(&rnd_, key_value_size - 10)));
+ }
+ }
+ db_->FlushWAL(false);
+ }
+
+#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite
+ void WaitForFlush(int cf) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ }
+
+ void WaitForCompaction() {
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ uint64_t MaxTotalInMemoryState() {
+ return dbfull()->TEST_MaxTotalInMemoryState();
+ }
+
+ void AssertMaxTotalInMemoryState(uint64_t value) {
+ ASSERT_EQ(value, MaxTotalInMemoryState());
+ }
+#endif // !ROCKSDB_LITE
+
+ Status Put(int cf, const std::string& key, const std::string& value) {
+ return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+ }
+ Status Merge(int cf, const std::string& key, const std::string& value) {
+ return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+ }
+ Status Flush(int cf) {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+
+ std::string Get(int cf, const std::string& key) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], Slice(key), &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ void CompactAll(int cf) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
+ nullptr));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ int NumTableFilesAtLevel(int level, int cf) {
+ return GetProperty(cf,
+ "rocksdb.num-files-at-level" + ToString(level));
+ }
+
+#ifndef ROCKSDB_LITE
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf) {
+ std::string result;
+ int last_non_zero_offset = 0;
+ for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = static_cast<int>(result.size());
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+#endif
+
+ void AssertFilesPerLevel(const std::string& value, int cf) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(value, FilesPerLevel(cf));
+#else
+ (void) value;
+ (void) cf;
+#endif
+ }
+
+#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported
+ int CountLiveFiles() {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ return static_cast<int>(metadata.size());
+ }
+#endif // !ROCKSDB_LITE
+
+ void AssertCountLiveFiles(int expected_value) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(expected_value, CountLiveFiles());
+#else
+ (void) expected_value;
+#endif
+ }
+
+ // Do n memtable flushes, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int cf, int n, const std::string& small,
+ const std::string& large) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
+ }
+ }
+
+#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
+ int CountLiveLogFiles() {
+ int micros_wait_for_log_deletion = 20000;
+ env_->SleepForMicroseconds(micros_wait_for_log_deletion);
+ int ret = 0;
+ VectorLogPtr wal_files;
+ Status s;
+    // GetSortedWalFiles is a flaky function -- it gets all the wal_dir
+    // children files and then later checks for their existence. If some of the
+    // log files don't exist anymore, it reports an error. It does all of this
+    // without the DB mutex held, so if a background process deletes a log file
+    // while the function is being executed, it returns an error. We retry the
+    // function 10 times to avoid the error failing the test.
+ for (int retries = 0; retries < 10; ++retries) {
+ wal_files.clear();
+ s = db_->GetSortedWalFiles(wal_files);
+ if (s.ok()) {
+ break;
+ }
+ }
+ EXPECT_OK(s);
+ for (const auto& wal : wal_files) {
+ if (wal->Type() == kAliveLogFile) {
+ ++ret;
+ }
+ }
+    return ret;
+ }
+#endif // !ROCKSDB_LITE
+
+ void AssertCountLiveLogFiles(int value) {
+#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
+ ASSERT_EQ(value, CountLiveLogFiles());
+#else
+ (void) value;
+#endif // !ROCKSDB_LITE
+ }
+
+ void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+ assert(num_per_cf.size() == handles_.size());
+
+#ifndef ROCKSDB_LITE // GetProperty is not supported in lite
+ for (size_t i = 0; i < num_per_cf.size(); ++i) {
+ ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
+ "rocksdb.num-immutable-mem-table"));
+ }
+#endif // !ROCKSDB_LITE
+ }
+
+ void CopyFile(const std::string& source, const std::string& destination,
+ uint64_t size = 0) {
+ const EnvOptions soptions;
+ std::unique_ptr<SequentialFile> srcfile;
+ ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+ std::unique_ptr<WritableFile> destfile;
+ ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+ if (size == 0) {
+ // default argument means copy everything
+ ASSERT_OK(env_->GetFileSize(source, &size));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+ ASSERT_OK(srcfile->Read(one, &slice, buffer));
+ ASSERT_OK(destfile->Append(slice));
+ size -= slice.size();
+ }
+ ASSERT_OK(destfile->Close());
+ }
+
+ int GetSstFileCount(std::string path) {
+ std::vector<std::string> files;
+ DBTestBase::GetSstFiles(env_, path, &files);
+ return static_cast<int>(files.size());
+ }
+
+ void RecalculateWriteStallConditions(ColumnFamilyData* cfd,
+ const MutableCFOptions& mutable_cf_options) {
+ // add lock to avoid race condition between
+ // `RecalculateWriteStallConditions` which writes to CFStats and
+ // background `DBImpl::DumpStats()` threads which read CFStats
+ dbfull()->TEST_LockMutex();
+ cfd->RecalculateWriteStallConditions(mutable_cf_options);
+    dbfull()->TEST_UnlockMutex();
+ }
+
+ std::vector<ColumnFamilyHandle*> handles_;
+ std::vector<std::string> names_;
+ std::vector<std::set<std::string>> keys_;
+ ColumnFamilyOptions column_family_options_;
+ DBOptions db_options_;
+ std::string dbname_;
+ DB* db_ = nullptr;
+ EnvCounter* env_;
+ std::shared_ptr<Env> env_guard_;
+ Random rnd_;
+ uint32_t format_;
+};
+
+class ColumnFamilyTest
+ : public ColumnFamilyTestBase,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+ ColumnFamilyTest() : ColumnFamilyTestBase(GetParam()) {}
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
+ testing::Values(test::kLatestFormatVersion));
+
+TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
+ for (int iter = 0; iter < 3; ++iter) {
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(i, cfh->GetID());
+ }
+ if (iter == 1) {
+ Reopen();
+ }
+ DropColumnFamilies({3});
+ Reopen();
+ if (iter == 2) {
+ // this tests if max_column_family is correctly persisted with
+ // WriteSnapshot()
+ Reopen();
+ }
+ CreateColumnFamilies({"three2"});
+ // ID 3 that was used for dropped column family "three" should not be
+ // reused
+ auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
+ ASSERT_EQ(4U, cfh3->GetID());
+ Close();
+ Destroy();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
+ Open();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteOptionsFile:1",
+ "ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1"},
+ {"ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2",
+ "DBImpl::WriteOptionsFile:2"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread(
+ [&] { CreateColumnFamilies({"one"}); });
+
+ TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1");
+ uint64_t pv;
+ db_->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem, &pv);
+ TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2");
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // !ROCKSDB_LITE
+
+class FlushEmptyCFTestWithParam
+ : public ColumnFamilyTestBase,
+ virtual public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ FlushEmptyCFTestWithParam()
+ : ColumnFamilyTestBase(std::get<0>(GetParam())),
+ allow_2pc_(std::get<1>(GetParam())) {}
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ bool allow_2pc_;
+};
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ db_options_.allow_2pc = allow_2pc_;
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ // Generate log file A.
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1
+
+ Reopen();
+ // Log file A is not dropped after reopening because default column family's
+ // min log number is 0.
+ // It flushes to SST file X
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 2
+ ASSERT_OK(Put(1, "bar", "v2")); // seqID 3
+ // Current log file is file B now. While flushing, a new log file C is created
+  // and is set to current. Both CFs' min log number is set to file C in memory, so
+ // after flushing file B is deleted. At the same time, the min log number of
+ // default CF is not written to manifest. Log file A still remains.
+ // Flushed to SST file Y.
+ Flush(1);
+ Flush(0);
+ ASSERT_OK(Put(1, "bar", "v3")); // seqID 4
+ ASSERT_OK(Put(1, "foo", "v4")); // seqID 5
+ db_->FlushWAL(false);
+
+ // Preserve file system state up to here to simulate a crash condition.
+ fault_env->SetFilesystemActive(false);
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+
+ Close();
+ fault_env->ResetState();
+
+ // Before opening, there are four files:
+ // Log file A contains seqID 1
+ // Log file C contains seqID 4, 5
+ // SST file X contains seqID 1
+ // SST file Y contains seqID 2, 3
+ // Min log number:
+ // default CF: 0
+ // CF one, two: C
+ // When opening the DB, all the seqID should be preserved.
+ Open(names, {});
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+ Close();
+
+ db_options_.env = env_;
+}
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ db_options_.allow_2pc = allow_2pc_;
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ // Generate log file A.
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1
+
+ Reopen();
+ // Log file A is not dropped after reopening because default column family's
+ // min log number is 0.
+ // It flushes to SST file X
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 2
+ ASSERT_OK(Put(1, "bar", "v2")); // seqID 3
+  // Current log file is file B now. While flushing, a new log file C is
+  // created and set to current. Both CFs' min log numbers are set to file C,
+  // so file B is deleted after the flush. Log file A still remains.
+ // Flushed to SST file Y.
+ Flush(1);
+ ASSERT_OK(Put(0, "bar", "v2")); // seqID 4
+ ASSERT_OK(Put(2, "bar", "v2")); // seqID 5
+ ASSERT_OK(Put(1, "bar", "v3")); // seqID 6
+  // Flush all column families. This forces all CFs' min log numbers to the
+  // current log, and that is written to the manifest file. Log file C is then
+  // deleted.
+ Flush(0);
+ Flush(1);
+ Flush(2);
+ // Write to log file D
+ ASSERT_OK(Put(1, "bar", "v4")); // seqID 7
+ ASSERT_OK(Put(1, "bar", "v5")); // seqID 8
+ db_->FlushWAL(false);
+ // Preserve file system state up to here to simulate a crash condition.
+ fault_env->SetFilesystemActive(false);
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+
+ Close();
+ fault_env->ResetState();
+ // Before opening, there are two logfiles:
+ // Log file A contains seqID 1
+ // Log file D contains seqID 7, 8
+ // Min log number:
+ // default CF: D
+ // CF one, two: D
+ // When opening the DB, log file D should be replayed using the seqID
+ // specified in the file.
+ Open(names, {});
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "bar"));
+ Close();
+
+ db_options_.env = env_;
+}
+
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, FlushEmptyCFTestWithParam,
+ testing::Values(std::make_tuple(test::kDefaultFormatVersion, true),
+ std::make_tuple(test::kDefaultFormatVersion, false)));
+INSTANTIATE_TEST_CASE_P(
+ FormatLatest, FlushEmptyCFTestWithParam,
+ testing::Values(std::make_tuple(test::kLatestFormatVersion, true),
+ std::make_tuple(test::kLatestFormatVersion, false)));
+
+TEST_P(ColumnFamilyTest, AddDrop) {
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
+ DropColumnFamilies({2});
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ CreateColumnFamilies({"four"});
+ ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+ Close();
+ ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
+ Open({"default", "one", "three", "four"});
+ DropColumnFamilies({1});
+ Reopen();
+ Close();
+
+ std::vector<std::string> families;
+ ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+ std::sort(families.begin(), families.end());
+ ASSERT_TRUE(families ==
+ std::vector<std::string>({"default", "four", "three"}));
+}
+
+TEST_P(ColumnFamilyTest, BulkAddDrop) {
+ constexpr int kNumCF = 1000;
+ ColumnFamilyOptions cf_options;
+ WriteOptions write_options;
+ Open();
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyHandle*> cf_handles;
+ for (int i = 1; i <= kNumCF; i++) {
+ cf_names.push_back("cf1-" + ToString(i));
+ }
+ ASSERT_OK(db_->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+ for (int i = 1; i <= kNumCF; i++) {
+ ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+ }
+ ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+ std::vector<ColumnFamilyDescriptor> cf_descriptors;
+ for (auto* handle : cf_handles) {
+ delete handle;
+ }
+ cf_handles.clear();
+ for (int i = 1; i <= kNumCF; i++) {
+ cf_descriptors.emplace_back("cf2-" + ToString(i), ColumnFamilyOptions());
+ }
+ ASSERT_OK(db_->CreateColumnFamilies(cf_descriptors, &cf_handles));
+ for (int i = 1; i <= kNumCF; i++) {
+ ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+ }
+ ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+ for (auto* handle : cf_handles) {
+ delete handle;
+ }
+ Close();
+ std::vector<std::string> families;
+ ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+ std::sort(families.begin(), families.end());
+ ASSERT_TRUE(families == std::vector<std::string>({"default"}));
+}
+
+TEST_P(ColumnFamilyTest, DropTest) {
+  // first iteration - don't reopen DB before dropping
+ // second iteration - reopen DB before dropping
+ for (int iter = 0; iter < 2; ++iter) {
+ Open({"default"});
+ CreateColumnFamiliesAndReopen({"pikachu"});
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, ToString(i), "bar" + ToString(i)));
+ }
+ ASSERT_OK(Flush(1));
+
+ if (iter == 1) {
+ Reopen();
+ }
+ ASSERT_EQ("bar1", Get(1, "1"));
+
+ AssertCountLiveFiles(1);
+ DropColumnFamilies({1});
+ // make sure that all files are deleted when we drop the column family
+ AssertCountLiveFiles(0);
+ Destroy();
+ }
+}
+
+TEST_P(ColumnFamilyTest, WriteBatchFailure) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ WriteBatch batch;
+ batch.Put(handles_[0], Slice("existing"), Slice("column-family"));
+ batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ DropColumnFamilies({1});
+ WriteOptions woptions_ignore_missing_cf;
+ woptions_ignore_missing_cf.ignore_missing_column_families = true;
+ batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
+ ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+ ASSERT_EQ("column-family", Get(0, "still here"));
+ Status s = db_->Write(WriteOptions(), &batch);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, ReadWrite) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "mirko", "v3"));
+ ASSERT_OK(Put(0, "foo", "v2"));
+ ASSERT_OK(Put(2, "fodor", "v5"));
+
+ for (int iter = 0; iter <= 3; ++iter) {
+ ASSERT_EQ("v2", Get(0, "foo"));
+ ASSERT_EQ("v2", Get(0, "bar"));
+ ASSERT_EQ("v3", Get(1, "mirko"));
+ ASSERT_EQ("v5", Get(2, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+ if (iter <= 1) {
+ Reopen();
+ }
+ }
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
+ std::string backup_logs = dbname_ + "/backup_logs";
+
+ // delete old files in backup_logs directory
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ std::vector<std::string> old_files;
+ env_->GetChildren(backup_logs, &old_files);
+ for (auto& file : old_files) {
+ if (file != "." && file != "..") {
+ env_->DeleteFile(backup_logs + "/" + file);
+ }
+ }
+
+ column_family_options_.merge_operator =
+ MergeOperators::CreateUInt64AddOperator();
+ db_options_.wal_dir = dbname_ + "/logs";
+ Destroy();
+ Open();
+ CreateColumnFamilies({"cf1", "cf2"});
+
+ // fill up the DB
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+ ASSERT_OK(Merge(0, "foo", one));
+ ASSERT_OK(Merge(1, "mirko", one));
+ ASSERT_OK(Merge(0, "foo", one));
+ ASSERT_OK(Merge(2, "bla", one));
+ ASSERT_OK(Merge(2, "fodor", one));
+ ASSERT_OK(Merge(0, "bar", one));
+ ASSERT_OK(Merge(2, "bla", one));
+ ASSERT_OK(Merge(1, "mirko", two));
+ ASSERT_OK(Merge(1, "franjo", one));
+
+ // copy the logs to backup
+ std::vector<std::string> logs;
+ env_->GetChildren(db_options_.wal_dir, &logs);
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+ }
+ }
+
+ // recover the DB
+ Close();
+
+  // 1. check consistency
+  // 2. copy the logs from the backup back to the WAL dir. If recovery ran
+  // again on the same log files, it would lead to incorrect results due to
+  // applying the merge operator twice
+  // 3. check consistency
+ for (int iter = 0; iter < 2; ++iter) {
+ // assert consistency
+ Open({"default", "cf1", "cf2"});
+ ASSERT_EQ(two, Get(0, "foo"));
+ ASSERT_EQ(one, Get(0, "bar"));
+ ASSERT_EQ(three, Get(1, "mirko"));
+ ASSERT_EQ(one, Get(1, "franjo"));
+ ASSERT_EQ(one, Get(2, "fodor"));
+ ASSERT_EQ(two, Get(2, "bla"));
+ Close();
+
+ if (iter == 0) {
+ // copy the logs from backup back to wal dir
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+ }
+ }
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE // TEST functions used are not supported
+TEST_P(ColumnFamilyTest, FlushTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "mirko", "v3"));
+ ASSERT_OK(Put(0, "foo", "v2"));
+ ASSERT_OK(Put(2, "fodor", "v5"));
+
+ for (int j = 0; j < 2; j++) {
+ ReadOptions ro;
+ std::vector<Iterator*> iterators;
+ // Hold super version.
+ if (j == 0) {
+ ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ uint64_t max_total_in_memory_state =
+ MaxTotalInMemoryState();
+ Flush(i);
+ AssertMaxTotalInMemoryState(max_total_in_memory_state);
+ }
+ ASSERT_OK(Put(1, "foofoo", "bar"));
+ ASSERT_OK(Put(0, "foofoo", "bar"));
+
+ for (auto* it : iterators) {
+ delete it;
+ }
+ }
+ Reopen();
+
+ for (int iter = 0; iter <= 2; ++iter) {
+ ASSERT_EQ("v2", Get(0, "foo"));
+ ASSERT_EQ("v2", Get(0, "bar"));
+ ASSERT_EQ("v3", Get(1, "mirko"));
+ ASSERT_EQ("v5", Get(2, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+ if (iter <= 1) {
+ Reopen();
+ }
+ }
+ Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, LogDeletionTest) {
+ db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+ column_family_options_.arena_block_size = 4 * 1024;
+ column_family_options_.write_buffer_size = 128000; // 128KB
+ Open();
+ CreateColumnFamilies({"one", "two", "three", "four"});
+  // Each bracket is one log file. If a number is in (), it means
+  // we don't need it anymore (it's been flushed).
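+  // A log file can only be deleted once every column family with data in it
+  // has flushed that data; while any CF listed in a bracket is still
+  // unflushed, the file stays live.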
+ // []
+ AssertCountLiveLogFiles(0);
+ PutRandomData(0, 1, 128);
+ // [0]
+ PutRandomData(1, 1, 128);
+ // [0, 1]
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [0, (1)] [1]
+ AssertCountLiveLogFiles(2);
+ PutRandomData(0, 1, 128);
+ // [0, (1)] [0, 1]
+ AssertCountLiveLogFiles(2);
+ PutRandomData(2, 1, 128);
+ // [0, (1)] [0, 1, 2]
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [0, (1)] [0, 1, (2)] [2]
+ AssertCountLiveLogFiles(3);
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [0, (1)] [0, 1, (2)] [(2)] [2]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(3, 1, 128);
+ // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
+ PutRandomData(1, 1, 128);
+ // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
+ AssertCountLiveLogFiles(5);
+ PutRandomData(0, 1000, 128);
+ WaitForFlush(0);
+ // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
+ // delete obsolete logs -->
+ // [(1), 2, 3] [1, (0)] [0]
+ AssertCountLiveLogFiles(3);
+ PutRandomData(0, 1000, 128);
+ WaitForFlush(0);
+ // [(1), 2, 3] [1, (0)], [(0)] [0]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
+ AssertCountLiveLogFiles(5);
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
+ AssertCountLiveLogFiles(6);
+ PutRandomData(3, 1000, 128);
+ WaitForFlush(3);
+ // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
+ // delete obsolete logs -->
+ // [0, (1)] [1, (2)], [2, (3)] [3]
+ AssertCountLiveLogFiles(4);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CrashAfterFlush) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ Open();
+ CreateColumnFamilies({"one"});
+
+ WriteBatch batch;
+ batch.Put(handles_[0], Slice("foo"), Slice("bar"));
+ batch.Put(handles_[1], Slice("foo"), Slice("bar"));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ Flush(0);
+ fault_env->SetFilesystemActive(false);
+
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+ Close();
+ fault_env->DropUnsyncedFileData();
+ fault_env->ResetState();
+ Open(names, {});
+
+ // Write batch should be atomic.
+ ASSERT_EQ(Get(0, "foo"), Get(1, "foo"));
+
+ Close();
+ db_options_.env = env_;
+}
+
+TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
+ ASSERT_OK(TryOpen({"default"}));
+ Close();
+ ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
+ // disable flushing stale column families
+ db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ ColumnFamilyOptions default_cf, one, two, three;
+  // Set up options. All column families have max_write_buffer_number set to 10.
+  // "default" -> 100KB memtable, start flushing immediately
+  // "one" -> 200KB memtable, start flushing with two immutable memtables
+  // "two" -> 1MB memtable, start flushing with three immutable memtables
+  // "three" -> 90KB memtable, start flushing with four immutable memtables
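+  // (min_write_buffer_number_to_merge is what delays the flush until the
+  // listed number of immutable memtables has accumulated, which the
+  // AssertNumberOfImmutableMemtables() calls below track.)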
+ default_cf.write_buffer_size = 100000;
+ default_cf.arena_block_size = 4 * 4096;
+ default_cf.max_write_buffer_number = 10;
+ default_cf.min_write_buffer_number_to_merge = 1;
+ default_cf.max_write_buffer_size_to_maintain = 0;
+ one.write_buffer_size = 200000;
+ one.arena_block_size = 4 * 4096;
+ one.max_write_buffer_number = 10;
+ one.min_write_buffer_number_to_merge = 2;
+ one.max_write_buffer_size_to_maintain =
+ static_cast<int>(one.write_buffer_size);
+ two.write_buffer_size = 1000000;
+ two.arena_block_size = 4 * 4096;
+ two.max_write_buffer_number = 10;
+ two.min_write_buffer_number_to_merge = 3;
+ two.max_write_buffer_size_to_maintain =
+ static_cast<int>(two.write_buffer_size);
+ three.write_buffer_size = 4096 * 22;
+ three.arena_block_size = 4096;
+ three.max_write_buffer_number = 10;
+ three.min_write_buffer_number_to_merge = 4;
+ three.max_write_buffer_size_to_maintain =
+ static_cast<int>(three.write_buffer_size);
+
+ Reopen({default_cf, one, two, three});
+
+ int micros_wait_for_flush = 10000;
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(1);
+ PutRandomData(1, 200, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+ AssertCountLiveLogFiles(2);
+ PutRandomData(2, 1000, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 1, 0});
+ AssertCountLiveLogFiles(3);
+ PutRandomData(2, 1000, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 0});
+ AssertCountLiveLogFiles(4);
+ PutRandomData(3, 93, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 1});
+ AssertCountLiveLogFiles(5);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 2});
+ AssertCountLiveLogFiles(6);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+ AssertCountLiveLogFiles(7);
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+ AssertCountLiveLogFiles(8);
+ PutRandomData(2, 100, 10000);
+ WaitForFlush(2);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 3});
+ AssertCountLiveLogFiles(9);
+ PutRandomData(3, 88, 990);
+ WaitForFlush(3);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+ AssertCountLiveLogFiles(10);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 1});
+ AssertCountLiveLogFiles(11);
+ PutRandomData(1, 200, 1000);
+ WaitForFlush(1);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 1});
+ AssertCountLiveLogFiles(5);
+ PutRandomData(3, 88 * 3, 990);
+ WaitForFlush(3);
+ PutRandomData(3, 88 * 4, 990);
+ WaitForFlush(3);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(2, 3 * 1000, 1000);
+ WaitForFlush(2);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+  PutRandomData(1, 2 * 200, 1000);
+ WaitForFlush(1);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(7);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+// The test is commented out because we want to verify that a snapshot is not
+// created for memtables that don't support it, but there isn't a memtable
+// that doesn't support snapshots right now. If we add one later, we can
+// re-enable the test.
+//
+// #ifndef ROCKSDB_LITE // Cuckoo is not supported in lite
+// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+// db_options_.allow_concurrent_memtable_write = false;
+// Open();
+// auto* s1 = dbfull()->GetSnapshot();
+// ASSERT_TRUE(s1 != nullptr);
+// dbfull()->ReleaseSnapshot(s1);
+
+// // Add a column family that doesn't support snapshot
+// ColumnFamilyOptions first;
+// first.memtable_factory.reset(new DummyMemtableNotSupportingSnapshot());
+// CreateColumnFamilies({"first"}, {first});
+// auto* s2 = dbfull()->GetSnapshot();
+// ASSERT_TRUE(s2 == nullptr);
+
+// // Add a column family that supports snapshot. Snapshot stays not
+// supported. ColumnFamilyOptions second; CreateColumnFamilies({"second"},
+// {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr);
+// Close();
+// }
+// #endif // !ROCKSDB_LITE
+
+class TestComparator : public Comparator {
+ int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
+ const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
+ return 0;
+ }
+ const char* Name() const override { return "Test"; }
+ void FindShortestSeparator(
+ std::string* /*start*/,
+ const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+static TestComparator third_comparator;
+static TestComparator fourth_comparator;
+
+// Test that we can retrieve the comparator from a created CF
+TEST_P(ColumnFamilyTest, GetComparator) {
+ Open();
+ // Add a column family with no comparator specified
+ CreateColumnFamilies({"first"});
+ const Comparator* comp = handles_[0]->GetComparator();
+ ASSERT_EQ(comp, BytewiseComparator());
+
+ // Add three column families - one with no comparator and two
+ // with comparators specified
+ ColumnFamilyOptions second, third, fourth;
+ second.comparator = &third_comparator;
+ third.comparator = &fourth_comparator;
+ CreateColumnFamilies({"second", "third", "fourth"}, {second, third, fourth});
+ ASSERT_EQ(handles_[1]->GetComparator(), BytewiseComparator());
+ ASSERT_EQ(handles_[2]->GetComparator(), &third_comparator);
+ ASSERT_EQ(handles_[3]->GetComparator(), &fourth_comparator);
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
+ Open();
+ CreateColumnFamilies({"first", "second"});
+ ColumnFamilyOptions default_cf, first, second;
+ first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ second.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen({default_cf, first, second});
+
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+
+ ASSERT_OK(Put(0, "foo", two));
+ ASSERT_OK(Put(0, "foo", one));
+ ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+ ASSERT_EQ(Get(0, "foo"), one);
+
+ ASSERT_OK(Put(1, "foo", two));
+ ASSERT_OK(Put(1, "foo", one));
+ ASSERT_OK(Merge(1, "foo", two));
+ ASSERT_EQ(Get(1, "foo"), three);
+
+ ASSERT_OK(Put(2, "foo", two));
+ ASSERT_OK(Put(2, "foo", one));
+ ASSERT_OK(Merge(2, "foo", two));
+ ASSERT_EQ(Get(2, "foo"), one + "," + two);
+ Close();
+}
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = static_cast<uint64_t>(1) << 60;
+
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
+ PutRandomData(1, 10, 12000);
+ PutRandomData(1, 1, 10);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(ToString(i + 1), 2);
+ }
+
+ // TRIGGER compaction "one"
+ PutRandomData(1, 10, 12000);
+ PutRandomData(1, 1, 10);
+
+ // TRIGGER compaction "two"
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+// Sync points not supported in RocksDB Lite
+
+TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::MultiManual:4", "ColumnFamilyTest::MultiManual:1"},
+ {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"},
+ {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:3");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
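+  // Expected interleaving: the manual compaction on "one" (first thread
+  // below) reaches AfterRun, signals :4 and stalls at :3; that lets the
+  // second thread run its manual compaction on "two" to completion, and its
+  // :2 then releases both the stalled compaction and the main thread, so the
+  // two non-exclusive manual compactions overlap.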
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(ToString(i + 1), 2);
+ }
+ threads.emplace_back([&] {
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:1");
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:2");
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:5");
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ bool cf_1_1 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:1"},
+ {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:5"},
+ {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(ToString(i + 1), 2);
+ }
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+ threads.join();
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:1"},
+ {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(ToString(i + 1), 2);
+ }
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+ threads.join();
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:2"},
+ {"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:5"},
+ {"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:2"},
+ {"ColumnFamilyTest::ManualManual:1",
+ "ColumnFamilyTest::ManualManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = true;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:5");
+
+ WaitForFlush(1);
+
+ // Add more L0 files and force another manual compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+ 1);
+ }
+
+ ROCKSDB_NAMESPACE::port::Thread threads1([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:1");
+
+ threads.join();
+ threads1.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+ WaitForFlush(1);
+
+ // Add more L0 files and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+ 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ threads.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleLevel;
+
+ one.num_levels = 1;
+  // trigger compaction if there are >= 3 files
+ one.level0_file_num_compaction_trigger = 3;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- level style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+ {"ColumnFamilyTest::ManualAuto:3", "ColumnFamilyTest::ManualAuto:2"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "ColumnFamilyTest::ManualAuto:3"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+ // Add more L0 files and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+ 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ threads.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("0,1", 1);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In this test, we generate enough files to trigger an automatic compaction.
+// The automatic compaction waits in NonTrivial:AfterRun.
+// We generate more files and then trigger a manual compaction.
+// The manual compaction will wait because the automatic compaction holds files
+// it needs. Once the conflict is hit, the automatic compaction resumes and
+// finishes, and then the manual compaction runs and ends.
+TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ bool cf_1_1 = true;
+ bool cf_1_2 = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:2"},
+ {"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:5"},
+ {"CompactionPicker::CompactRange:Conflict",
+ "ColumnFamilyTest::AutoManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+ cf_1_1 = false;
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+ } else if (cf_1_2) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+ cf_1_2 = false;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
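+  // The "CompactionPicker::CompactRange:Conflict" dependency means the stalled
+  // automatic compaction is only released once the manual CompactRange() below
+  // detects that its input files are already being compacted.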
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(ToString(i + 1), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+
+ // Add another L0 file and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ }
+
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // Tailing iterator not supported
+namespace {
+std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+}
+} // anonymous namespace
+
+TEST_P(ColumnFamilyTest, NewIteratorsTest) {
+ // iter == 0 -- no tailing
+  // iter == 1 -- tailing
+ for (int iter = 0; iter < 2; ++iter) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "a", "b"));
+ ASSERT_OK(Put(1, "b", "a"));
+ ASSERT_OK(Put(2, "c", "m"));
+ ASSERT_OK(Put(2, "v", "t"));
+ std::vector<Iterator*> iterators;
+ ReadOptions options;
+ options.tailing = (iter == 1);
+ ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
+
+ for (auto it : iterators) {
+ it->SeekToFirst();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+ ASSERT_EQ(IterStatus(iterators[1]), "b->a");
+ ASSERT_EQ(IterStatus(iterators[2]), "c->m");
+
+ ASSERT_OK(Put(1, "x", "x"));
+
+ for (auto it : iterators) {
+ it->Next();
+ }
+
+ ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+ if (iter == 0) {
+ // no tailing
+ ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+ } else {
+ // tailing
+ ASSERT_EQ(IterStatus(iterators[1]), "x->x");
+ }
+ ASSERT_EQ(IterStatus(iterators[2]), "v->t");
+
+ for (auto it : iterators) {
+ delete it;
+ }
+ Destroy();
+ }
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
+TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+ ASSERT_OK(Put(0, "a", "b"));
+ ASSERT_OK(Put(1, "foo", "bla"));
+ ASSERT_OK(Put(2, "foo", "blabla"));
+ ASSERT_OK(Put(3, "foo", "blablabla"));
+ ASSERT_OK(Put(4, "foo", "blablablabla"));
+
+ DropColumnFamilies({2});
+ Close();
+ // open only a subset of column families
+ AssertOpenReadOnly({"default", "one", "four"});
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ ASSERT_EQ("bla", Get(1, "foo"));
+ ASSERT_EQ("blablablabla", Get(2, "foo"));
+
+  // test NewIterators()
+ {
+ std::vector<Iterator*> iterators;
+ ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators));
+ for (auto it : iterators) {
+ it->SeekToFirst();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+ ASSERT_EQ(IterStatus(iterators[1]), "foo->bla");
+ ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla");
+ for (auto it : iterators) {
+ it->Next();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+ ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+ ASSERT_EQ(IterStatus(iterators[2]), "(invalid)");
+
+ for (auto it : iterators) {
+ delete it;
+ }
+ }
+
+ Close();
+ // can't open dropped column family
+ Status s = OpenReadOnly({"default", "one", "two"});
+ ASSERT_TRUE(!s.ok());
+
+ // Can't open without specifying default column family
+ s = OpenReadOnly({"one", "four"});
+ ASSERT_TRUE(!s.ok());
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite
+TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ PutRandomData(static_cast<int>(i), 10, 100);
+ }
+ int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
+ // this will trigger the flushes
+ for (int i = 0; i <= 4; ++i) {
+ ASSERT_OK(Flush(i));
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ WaitForFlush(i);
+ }
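+  // Each flush writes one SST file per column family, but the WAL should be
+  // rolled only once: after the first flush switches to a fresh log, the
+  // remaining flushes find that log empty and reuse it, which is why we expect
+  // handles_.size() + 1 new writable files below.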
+ int total_new_writable_files =
+ env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
+ ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite
+TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ default_cf.write_buffer_size = 100000; // small write buffer size
+ default_cf.arena_block_size = 4096;
+ default_cf.disable_auto_compactions = true;
+ one.disable_auto_compactions = true;
+ two.disable_auto_compactions = true;
+ db_options_.max_total_wal_size = 210000;
+
+ Reopen({default_cf, one, two});
+
+ PutRandomData(2, 1, 10); // 10 bytes
+ for (int i = 0; i < 2; ++i) {
+ PutRandomData(0, 100, 1000); // flush
+ WaitForFlush(0);
+
+ AssertCountLiveFiles(i + 1);
+ }
+ // third flush. now, CF [two] should be detected as stale and flushed
+ // column family 1 should not be flushed since it's empty
+ PutRandomData(0, 100, 1000); // flush
+ WaitForFlush(0);
+ WaitForFlush(2);
+  // 3 files for the default column family, 1 file for column family [two],
+  // zero files for column family [one], because it's empty
+ AssertCountLiveFiles(4);
+
+ Flush(0);
+ ASSERT_EQ(0, dbfull()->TEST_total_log_size());
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
+ Status s = TryOpen({"one", "two"});
+ ASSERT_TRUE(!s.ok());
+ db_options_.create_missing_column_families = true;
+ s = TryOpen({"default", "one", "two"});
+ ASSERT_TRUE(s.ok());
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, SanitizeOptions) {
+ DBOptions db_options;
+ for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
+ for (int l = 0; l <= 2; l++) {
+ for (int i = 1; i <= 3; i++) {
+ for (int j = 1; j <= 3; j++) {
+ for (int k = 1; k <= 3; k++) {
+ ColumnFamilyOptions original;
+ original.compaction_style = static_cast<CompactionStyle>(s);
+ original.num_levels = l;
+ original.level0_stop_writes_trigger = i;
+ original.level0_slowdown_writes_trigger = j;
+ original.level0_file_num_compaction_trigger = k;
+ original.write_buffer_size =
+ l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
+
+ ColumnFamilyOptions result =
+ SanitizeOptions(ImmutableDBOptions(db_options), original);
+ ASSERT_TRUE(result.level0_stop_writes_trigger >=
+ result.level0_slowdown_writes_trigger);
+ ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
+ result.level0_file_num_compaction_trigger);
+ ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
+ original.level0_file_num_compaction_trigger);
+ if (s == kCompactionStyleLevel) {
+ ASSERT_GE(result.num_levels, 2);
+ } else {
+ ASSERT_GE(result.num_levels, 1);
+ if (original.num_levels >= 1) {
+ ASSERT_EQ(result.num_levels, original.num_levels);
+ }
+ }
+
+            // Make sure SanitizeOptions() sets arena_block_size to 1/8 of
+            // write_buffer_size, rounded up to a multiple of 4KB.
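+            // E.g. l=1, i=1, j=1, k=1 gives write_buffer_size = 5243905;
+            // 5243905 / 8 = 655488, which rounds up to the next multiple of
+            // 4096: 659456 = 512KB + 128KB + 4KB, matching the formula below.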
+ size_t expected_arena_block_size =
+ l * 4 * 1024 * 1024 / 8 + i * 1024 * 1024 / 8;
+ if (j + k != 0) {
+ // not a multiple of 4k, round up 4k
+ expected_arena_block_size += 4 * 1024;
+ }
+ ASSERT_EQ(expected_arena_block_size, result.arena_block_size);
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(ColumnFamilyTest, ReadDroppedColumnFamily) {
+ // iter 0 -- drop CF, don't reopen
+ // iter 1 -- delete CF, reopen
+ for (int iter = 0; iter < 2; ++iter) {
+ db_options_.create_missing_column_families = true;
+ db_options_.max_open_files = 20;
+ // delete obsolete files always
+ db_options_.delete_obsolete_files_period_micros = 0;
+ Open({"default", "one", "two"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options, options});
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(0, kKeysNum, 100);
+ PutRandomData(1, kKeysNum, 100);
+ PutRandomData(2, kKeysNum, 100);
+
+ {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[2]));
+ iterator->SeekToFirst();
+
+ if (iter == 0) {
+ // Drop CF two
+ ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+ } else {
+ // delete CF two
+ db_->DestroyColumnFamilyHandle(handles_[2]);
+ handles_[2] = nullptr;
+ }
+ // Make sure iterator created can still be used.
+ int count = 0;
+ for (; iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+
+ // Add bunch more data to other CFs
+ PutRandomData(0, kKeysNum, 100);
+ PutRandomData(1, kKeysNum, 100);
+
+ if (iter == 1) {
+ Reopen();
+ }
+
+ // Since we didn't delete CF handle, RocksDB's contract guarantees that
+ // we're still able to read dropped CF
+ for (int i = 0; i < 3; ++i) {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[i]));
+ int count = 0;
+ for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2));
+ }
+
+ Close();
+ Destroy();
+ }
+}
+
+TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) {
+ db_options_.create_missing_column_families = true;
+ db_options_.max_open_files = 20;
+ // delete obsolete files always
+ db_options_.delete_obsolete_files_period_micros = 0;
+ Open({"default", "one", "two"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options, options});
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(1, kKeysNum, 100);
+
+ {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ iterator->SeekToFirst();
+
+ DropColumnFamilies({1});
+
+ // Make sure the iterator created before the drop can still be used.
+ int count = 0;
+ for (; iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+
+ Reopen();
+ Close();
+ Destroy();
+}
+
+TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
+ db_options_.create_missing_column_families = true;
+ Open({"default", "one"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.max_write_buffer_number = 20;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options});
+
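+ // Interleave the column family drop with the flush job: the flush cannot
+ // write its L0 table or install its results until the drop reaches the
+ // corresponding LogAndApply sync points, and the drop cannot finish until
+ // the flush has installed its results.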
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply::ColumnFamilyDrop:0",
+ "FlushJob::WriteLevel0Table"},
+ {"VersionSet::LogAndApply::ColumnFamilyDrop:1",
+ "FlushJob::InstallResults"},
+ {"FlushJob::InstallResults",
+ "VersionSet::LogAndApply::ColumnFamilyDrop:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ test::SleepingBackgroundTask sleeping_task;
+
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(1, kKeysNum, 100);
+
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] { ASSERT_OK(db_->DropColumnFamily(handles_[1])); });
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ sleeping_task.Reset();
+ // Now we sleep again. This is just so we're certain that the flush job
+ // finished.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ {
+ // Since we didn't delete the CF handle, RocksDB's contract guarantees that
+ // we're still able to read the dropped CF
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ Close();
+ Destroy();
+}
+
+#ifndef ROCKSDB_LITE
+ // Skipped because persisting options is not supported in ROCKSDB_LITE
+namespace {
+std::atomic<int> test_stage(0);
+std::atomic<bool> ordered_by_writethread(false);
+const int kMainThreadStartPersistingOptionsFile = 1;
+const int kChildThreadFinishDroppingColumnFamily = 2;
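+// Runs in a child thread: wait until the main thread has started persisting
+// the options file (or until write-thread ordering has been detected), then
+// drop the given column family, delete its comparator, and advance the test
+// stage.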
+void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id,
+ std::vector<Comparator*>* comparators) {
+ while (test_stage < kMainThreadStartPersistingOptionsFile &&
+ !ordered_by_writethread) {
+ Env::Default()->SleepForMicroseconds(100);
+ }
+ cf_test->DropColumnFamilies({cf_id});
+ if ((*comparators)[cf_id]) {
+ delete (*comparators)[cf_id];
+ (*comparators)[cf_id] = nullptr;
+ }
+ test_stage = kChildThreadFinishDroppingColumnFamily;
+}
+} // namespace
+
+TEST_P(ColumnFamilyTest, CreateAndDropRace) {
+ const int kCfCount = 5;
+ std::vector<ColumnFamilyOptions> cf_opts;
+ std::vector<Comparator*> comparators;
+ for (int i = 0; i < kCfCount; ++i) {
+ cf_opts.emplace_back();
+ comparators.push_back(new test::SimpleSuffixReverseComparator());
+ cf_opts.back().comparator = comparators.back();
+ }
+ db_options_.create_if_missing = true;
+ db_options_.create_missing_column_families = true;
+
+ auto main_thread_id = std::this_thread::get_id();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PersistRocksDBOptions:start", [&](void* /*arg*/) {
+ auto current_thread_id = std::this_thread::get_id();
+ // If it's the main thread hitting this sync-point, then it
+ // will be blocked until some other thread updates the test_stage.
+ if (main_thread_id == current_thread_id) {
+ test_stage = kMainThreadStartPersistingOptionsFile;
+ while (test_stage < kChildThreadFinishDroppingColumnFamily &&
+ !ordered_by_writethread) {
+ Env::Default()->SleepForMicroseconds(100);
+ }
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::EnterUnbatched:Wait", [&](void* /*arg*/) {
+ // This means a thread doing DropColumnFamily() is waiting for
+ // another thread to finish persisting the options.
+ // In that case, we set ordered_by_writethread to unblock the main thread.
+ ordered_by_writethread = true;
+ });
+
+ // Create a database with four column families
+ Open({"default", "one", "two", "three"},
+ {cf_opts[0], cf_opts[1], cf_opts[2], cf_opts[3]});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start a thread that will drop column family 1 ("one")
+ // and delete its comparator
+ ROCKSDB_NAMESPACE::port::Thread drop_cf_thread(DropSingleColumnFamily, this,
+ 1, &comparators);
+
+ DropColumnFamilies({2});
+
+ drop_cf_thread.join();
+ Close();
+ Destroy();
+ for (auto* comparator : comparators) {
+ if (comparator) {
+ delete comparator;
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
+ const uint64_t kBaseRate = 800000u;
+ db_options_.delayed_write_rate = kBaseRate;
+ db_options_.max_background_compactions = 6;
+
+ Open({"default"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+
+ mutable_cf_options.level0_slowdown_writes_trigger = 20;
+ mutable_cf_options.level0_stop_writes_trigger = 10000;
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+ mutable_cf_options.disable_auto_compactions = false;
+
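+ // The assertions below check how the write controller reacts as the
+ // estimated pending compaction bytes move around the soft (200) and hard
+ // (2000) limits: moving further past the soft limit ratchets the delayed
+ // write rate down by a factor of 1.25 per step, and crossing the hard limit
+ // stops writes entirely.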
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(400);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(450);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(205);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(202);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(198);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(399);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(599);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(2001);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(3001);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(390);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(100);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->set_l0_delay_trigger_count(100);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(101);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(101);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(200);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(0);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ mutable_cf_options.disable_auto_compactions = true;
+ dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->set_l0_delay_trigger_count(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(0, GetDbDelayedWriteRate());
+ ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+ vstorage->set_l0_delay_trigger_count(60);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(0, GetDbDelayedWriteRate());
+ ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+ mutable_cf_options.disable_auto_compactions = false;
+ vstorage->set_l0_delay_trigger_count(70);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(71);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(501);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) {
+ db_options_.max_background_compactions = 6;
+ Open({"default"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+
+ // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 36;
+ mutable_cf_options.level0_stop_writes_trigger = 50;
+ // Speedup threshold = 200 / 4 = 50
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
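+ // Below both speedup thresholds only one background compaction is allowed;
+ // once either the pending-bytes or the L0-file-count threshold is crossed,
+ // all six configured background compactions are allowed.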
+ vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(45);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(7);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(9);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(6);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ // Speed up threshold = min(4 * 2, 4 + (12 - 4)/4) = 6
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 16;
+ mutable_cf_options.level0_stop_writes_trigger = 30;
+
+ vstorage->set_l0_delay_trigger_count(5);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(7);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(3);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) {
+ const uint64_t kBaseRate = 810000u;
+ db_options_.delayed_write_rate = kBaseRate;
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ ColumnFamilyData* cfd1 =
+ static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+ mutable_cf_options.level0_slowdown_writes_trigger = 20;
+ mutable_cf_options.level0_stop_writes_trigger = 10000;
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+ MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+ mutable_cf_options1.soft_pending_compaction_bytes_limit = 500;
+
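+ // With two column families, the DB-wide delay condition reflects the most
+ // constrained column family: the delay stays in effect as long as either CF
+ // is past its soft pending compaction limit.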
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(70);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(800);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(700);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) {
+ db_options_.max_background_compactions = 6;
+ column_family_options_.soft_pending_compaction_bytes_limit = 200;
+ column_family_options_.hard_pending_compaction_bytes_limit = 2000;
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ ColumnFamilyData* cfd1 =
+ static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+ // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 36;
+ mutable_cf_options.level0_stop_writes_trigger = 30;
+ // Speedup threshold = 200 / 4 = 50
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+ MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+ mutable_cf_options1.level0_slowdown_writes_trigger = 16;
+
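+ // Compaction speedup is a DB-wide decision: if any column family is past
+ // its speedup threshold, six background compactions are allowed; once every
+ // CF is back below its thresholds, the count drops back to one.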
+ vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(60);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(30);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(70);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(20);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(3);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(9);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->set_l0_delay_trigger_count(2);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, CreateAndDestoryOptions) {
+ std::unique_ptr<ColumnFamilyOptions> cfo(new ColumnFamilyOptions());
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh));
+ cfo.reset();
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
+ SpecialEnv env(Env::Default());
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
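+ // TEST_SYNC_POINT("FlushCloseWALFiles:0") below will not return until the
+ // background flush job has finished, so the open WAL file count can be
+ // checked deterministically afterwards.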
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BGWorkFlush:done", "FlushCloseWALFiles:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Block flush jobs from running
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ TEST_SYNC_POINT("FlushCloseWALFiles:0");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
+ SpecialEnv env(Env::Default());
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ // Create an iterator holding the current super version.
+ Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+ // A flush will make `it` hold the last reference of its super version.
+ Flush(1);
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ // Flush jobs will close previous WAL files after finishing. By blocking
+ // flush jobs from running, we trigger a condition where the iterator
+ // destructor should close the WAL files.
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Deleting the iterator releases its super version, triggering the closing
+ // of all obsolete files.
+ delete it;
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ WaitForFlush(1);
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) {
+ SpecialEnv env(Env::Default());
+ // Allow both the flush job and the purge job to be scheduled.
+ env.SetBackgroundThreads(2, Env::HIGH);
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ // Create an iterator holding the current super version.
+ ReadOptions ro;
+ ro.background_purge_on_iterator_cleanup = true;
+ Iterator* it = db_->NewIterator(ro, handles_[1]);
+ // A flush will make `it` hold the last reference of its super version.
+ Flush(1);
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
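+ // Ordering enforced by the dependencies: the background purge job cannot
+ // start until the test reaches sync point 0, the test waits at sync point 1
+ // until the purge job has finished, and the background flush cannot start
+ // until the test reaches sync point 2.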
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+ "DBImpl::BGWorkPurge:start"},
+ {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Deleting the iterator releases its super version, but with
+ // background_purge_on_iterator_cleanup the cleanup is deferred to a
+ // background purge job, so the old WAL file stays open for now.
+ delete it;
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+ WaitForFlush(1);
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // TEST functions are not supported in lite
+TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
+ SpecialEnv env(Env::Default());
+ // Allow both the flush job and the purge job to be scheduled.
+ env.SetBackgroundThreads(2, Env::HIGH);
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(3));
+ column_family_options_.level0_file_num_compaction_trigger = 2;
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodar2", "mirko"));
+ Flush(1);
+
+ // Create an iterator holding the current super version, as well as
+ // the SST file just flushed.
+ ReadOptions ro;
+ ro.tailing = true;
+ ro.background_purge_on_iterator_cleanup = true;
+ Iterator* it = db_->NewIterator(ro, handles_[1]);
+ // A flush will make `it` hold the last reference of its super version.
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodar2", "mirko"));
+ Flush(1);
+
+ WaitForCompaction();
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
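+ // This test reuses the IteratorCloseWALFile2 sync-point names to impose the
+ // same ordering: purge starts only after point 0, point 1 waits for the
+ // purge to finish, and the background flush starts only after point 2.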
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+ "DBImpl::BGWorkPurge:start"},
+ {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ env.delete_count_.store(0);
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Seeking the tailing iterator refreshes its super version; with background
+ // purge enabled the old super version is handed to a background job, so no
+ // WAL file is closed and no file is deleted yet.
+ it->Seek("");
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ ASSERT_EQ(0, env.delete_count_.load());
+
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ASSERT_EQ(1, env.delete_count_.load());
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+ WaitForFlush(1);
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ASSERT_EQ(1, env.delete_count_.load());
+
+ delete it;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+ // Disabled on Windows because SyncWAL requires env->IsSyncThreadSafe()
+ // to return true, which is not the case in unbuffered mode.
+#ifndef OS_WIN
+TEST_P(ColumnFamilyTest, LogSyncConflictFlush) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+
+ Put(0, "", "");
+ Put(1, "foo", "bar");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+ "ColumnFamilyTest::LogSyncConflictFlush:1"},
+ {"ColumnFamilyTest::LogSyncConflictFlush:2",
+ "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([&] { db_->SyncWAL(); });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1");
+ Flush(1);
+ Put(1, "foo", "bar");
+ Flush(1);
+
+ TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2");
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Close();
+}
+#endif
+
+ // This test is placed here because the Column Family test infrastructure is
+ // used to ensure a roll of WAL files.
+ // The basic idea is to test that WAL truncation is detected and not ignored.
+TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+
+ Build(0, 100);
+
+ // Flush the 0th column family to force a roll of the WAL
+ Flush(0);
+
+ // Add some more entries
+ Build(100, 100);
+
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+
+ // collect wal files
+ std::vector<std::string> logfs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (!(ParseFileName(filenames[i], &number, &type))) continue;
+
+ if (type != kLogFile) continue;
+
+ logfs.push_back(filenames[i]);
+ }
+
+ std::sort(logfs.begin(), logfs.end());
+ ASSERT_GE(logfs.size(), 2);
+
+ // Take the second-to-last WAL file and truncate it
+ std::string fpath = dbname_ + "/" + logfs[logfs.size() - 2];
+ std::vector<std::string> names_save = names_;
+
+ uint64_t fsize;
+ ASSERT_OK(env_->GetFileSize(fpath, &fsize));
+ ASSERT_GT(fsize, 0);
+
+ Close();
+
+ std::string backup_logs = dbname_ + "/backup_logs";
+ std::string t_fpath = backup_logs + "/" + logfs[logfs.size() - 2];
+
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ // Not sure how easy it is to make this data-driven; we need to read back
+ // the WAL file and truncate the last 10 entries.
+ CopyFile(fpath, t_fpath, fsize - 9180);
+
+ ASSERT_OK(env_->DeleteFile(fpath));
+ ASSERT_OK(env_->RenameFile(t_fpath, fpath));
+
+ db_options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ OpenReadOnly(names_save);
+
+ CheckMissed();
+
+ Close();
+
+ Open(names_save);
+
+ CheckMissed();
+
+ Close();
+
+ // cleanup
+ env_->DeleteDir(backup_logs);
+}
+
+TEST_P(ColumnFamilyTest, DefaultCfPathsTest) {
+ Open();
+ // Leave cf_paths empty for one of the column families.
+ // Files should be generated according to db_paths for that
+ // column family.
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+ // Fill Column family 1.
+ PutRandomData(1, 100, 100);
+ Flush(1);
+
+ ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Fill column family 2
+ PutRandomData(2, 100, 100);
+ Flush(2);
+
+ // The SST file from column family 2 should be generated under
+ // db_paths, which is dbname_ in this case.
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+}
+
+TEST_P(ColumnFamilyTest, MultipleCFPathsTest) {
+ Open();
+ // Configure column-family-specific paths.
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ cf_opt2.cf_paths.emplace_back(dbname_ + "_two_1",
+ std::numeric_limits<uint64_t>::max());
+ CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+ PutRandomData(1, 100, 100, true /* save */);
+ Flush(1);
+
+ // Check that files are generated in appropriate paths.
+ ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ PutRandomData(2, 100, 100, true /* save */);
+ Flush(2);
+
+ ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Re-open and verify the keys.
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ for (int cf = 1; cf != 3; ++cf) {
+ ReadOptions read_options;
+ read_options.readahead_size = 0;
+ auto it = dbi->NewIterator(read_options, handles_[cf]);
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ Slice key(it->key());
+ ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString()));
+ }
+ delete it;
+
+ for (const auto& key : keys_[cf]) {
+ ASSERT_NE("NOT_FOUND", Get(cf, key));
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
new file mode 100644
index 000000000..948ada675
--- /dev/null
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -0,0 +1,421 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactFilesTest : public testing::Test {
+ public:
+ CompactFilesTest() {
+ env_ = Env::Default();
+ db_name_ = test::PerThreadDBPath("compact_files_test");
+ }
+
+ std::string db_name_;
+ Env* env_;
+};
+
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
+TEST_F(CompactFilesTest, L0ConflictsFiles) {
+ Options options;
+ // to trigger compaction more easily
+ const int kWriteBufferSize = 10000;
+ const int kLevel0Trigger = 2;
+ options.create_if_missing = true;
+ options.compaction_style = kCompactionStyleLevel;
+ // Small slowdown and stop triggers for experimental purposes.
+ options.level0_slowdown_writes_trigger = 20;
+ options.level0_stop_writes_trigger = 20;
+ options.write_buffer_size = kWriteBufferSize;
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ options.compression = kNoCompression;
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
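+ // Ordering: the background compaction cannot start until CompactFilesImpl
+ // reaches sync point 0, and CompactFilesImpl waits at sync point 1 until
+ // the background compaction has passed its own sync point 1.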
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0", "BackgroundCallCompaction:0"},
+ {"BackgroundCallCompaction:1", "CompactFilesImpl:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create a couple of files.
+ // The background compaction starts and waits at BackgroundCallCompaction:0.
+ for (int i = 0; i < kLevel0Trigger * 4; ++i) {
+ db->Put(WriteOptions(), ToString(i), "");
+ db->Put(WriteOptions(), ToString(100 - i), "");
+ db->Flush(FlushOptions());
+ }
+
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ std::string file1;
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ if (file1 == "") {
+ file1 = file.db_path + "/" + file.name;
+ } else {
+ std::string file2 = file.db_path + "/" + file.name;
+ // Another thread starts a CompactFiles call and creates an L0 compaction.
+ // The background compaction then notices that an L0 compaction is already
+ // in progress and does not do an L0 compaction itself.
+ // Once the background compaction finishes, the CompactFiles call finishes.
+ ASSERT_OK(db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ {file1, file2}, 0));
+ break;
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ delete db;
+}
+
+TEST_F(CompactFilesTest, ObsoleteFiles) {
+ Options options;
+ // to trigger compaction more easily
+ const int kWriteBufferSize = 65536;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.write_buffer_size = kWriteBufferSize;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 1000; i < 2000; ++i) {
+ db->Put(WriteOptions(), ToString(i),
+ std::string(kWriteBufferSize / 10, 'a' + (i % 26)));
+ }
+
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
+ reinterpret_cast<DBImpl*>(db)->TEST_WaitForCompact();
+
+ // Verify that all compaction input files are deleted.
+ for (auto fname : l0_files) {
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
+ }
+ delete db;
+}
+
+TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 1000;
+ options.level0_stop_writes_trigger = 1000;
+ options.write_buffer_size = 65536;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+ options.max_compaction_bytes = 5000;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 0; i < 500; ++i) {
+ db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+ }
+ reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+ auto l0_files_1 = collector->GetFlushedFiles();
+ collector->ClearFlushedFiles();
+ for (int i = 0; i < 500; ++i) {
+ db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+ }
+ reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+ auto l0_files_2 = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0));
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0));
+ // no assertion failure
+ delete db;
+}
+
+TEST_F(CompactFilesTest, CapturingPendingFiles) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ // Always do full scans for obsolete files (needed to reproduce the issue).
+ options.delete_obsolete_files_period_micros = 0;
+
+ // Add listener.
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create 5 files.
+ for (int i = 0; i < 5; ++i) {
+ db->Put(WriteOptions(), "key" + ToString(i), "value");
+ db->Flush(FlushOptions());
+ }
+
+ auto l0_files = collector->GetFlushedFiles();
+ EXPECT_EQ(5, l0_files.size());
+
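+ // The flush issued between the two test sync points happens while
+ // CompactFiles is paused between CompactFilesImpl:2 and CompactFilesImpl:3,
+ // so its output file must be captured as pending and survive the full
+ // obsolete-file scans enabled above.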
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"},
+ {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start compacting files.
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread(
+ [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); });
+
+ // In the meantime flush another file.
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0");
+ db->Put(WriteOptions(), "key5", "value");
+ db->Flush(FlushOptions());
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1");
+
+ compaction_thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ delete db;
+
+ // Make sure we can reopen the DB.
+ s = DB::Open(options, db_name_, &db);
+ ASSERT_TRUE(s.ok());
+ assert(db);
+ delete db;
+}
+
+TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
+ class FilterWithGet : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ if (db_ == nullptr) {
+ return true;
+ }
+ std::string res;
+ db_->Get(ReadOptions(), "", &res);
+ return true;
+ }
+
+ void SetDB(DB* db) {
+ db_ = db;
+ }
+
+ const char* Name() const override { return "FilterWithGet"; }
+
+ private:
+ DB* db_;
+ };
+
+
+ std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
+
+ Options options;
+ options.create_if_missing = true;
+ options.compaction_filter = cf.get();
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+
+ cf->SetDB(db);
+
+ // Write one L0 file
+ db->Put(WriteOptions(), "K1", "V1");
+ db->Flush(FlushOptions());
+
+ // Compact all L0 files using CompactFiles
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ std::string fname = file.db_path + "/" + file.name;
+ ASSERT_OK(
+ db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
+ }
+
+
+ delete db;
+}
+
+TEST_F(CompactFilesTest, SentinelCompressionType) {
+ if (!Zlib_Supported()) {
+ fprintf(stderr, "zlib compression not supported, skip this test\n");
+ return;
+ }
+ if (!Snappy_Supported()) {
+ fprintf(stderr, "snappy compression not supported, skip this test\n");
+ return;
+ }
+ // Check that passing `CompressionType::kDisableCompressionOption` to
+ // `CompactFiles` causes it to use the column family compression options.
+ for (auto compaction_style :
+ {CompactionStyle::kCompactionStyleLevel,
+ CompactionStyle::kCompactionStyleUniversal,
+ CompactionStyle::kCompactionStyleNone}) {
+ DestroyDB(db_name_, Options());
+ Options options;
+ options.compaction_style = compaction_style;
+ // L0: Snappy, L1: Zlib, L2: Snappy
+ options.compression_per_level = {CompressionType::kSnappyCompression,
+ CompressionType::kZlibCompression,
+ CompressionType::kSnappyCompression};
+ options.create_if_missing = true;
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(options, db_name_, &db));
+
+ db->Put(WriteOptions(), "key", "val");
+ db->Flush(FlushOptions());
+
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_EQ(1, l0_files.size());
+
+ // L0->L1 compaction, so output should be Zlib-compressed
+ CompactionOptions compaction_opts;
+ compaction_opts.compression = CompressionType::kDisableCompressionOption;
+ ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1));
+
+ ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
+ ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
+ for (const auto& name_and_table_props : all_tables_props) {
+ ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
+ name_and_table_props.second->compression_name);
+ }
+ delete db;
+ }
+}
+
+TEST_F(CompactFilesTest, GetCompactionJobInfo) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 1000;
+ options.level0_stop_writes_trigger = 1000;
+ options.write_buffer_size = 65536;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+ options.max_compaction_bytes = 5000;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ DestroyDB(db_name_, options);
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 0; i < 500; ++i) {
+ db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+ }
+ reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+ auto l0_files_1 = collector->GetFlushedFiles();
+ CompactionOptions co;
+ co.compression = CompressionType::kLZ4Compression;
+ CompactionJobInfo compaction_job_info{};
+ ASSERT_OK(
+ db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info));
+ ASSERT_EQ(compaction_job_info.base_input_level, 0);
+ ASSERT_EQ(compaction_job_info.cf_id, db->DefaultColumnFamily()->GetID());
+ ASSERT_EQ(compaction_job_info.cf_name, db->DefaultColumnFamily()->GetName());
+ ASSERT_EQ(compaction_job_info.compaction_reason,
+ CompactionReason::kManualCompaction);
+ ASSERT_EQ(compaction_job_info.compression, CompressionType::kLZ4Compression);
+ ASSERT_EQ(compaction_job_info.output_level, 0);
+ ASSERT_OK(compaction_job_info.status);
+ // no assertion failure
+ delete db;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compacted_db_impl.cc b/src/rocksdb/db/compacted_db_impl.cc
new file mode 100644
index 000000000..47d6ecced
--- /dev/null
+++ b/src/rocksdb/db/compacted_db_impl.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "table/get_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+ const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(
+ const DBOptions& options, const std::string& dbname)
+ : DBImpl(options, dbname), cfd_(nullptr), version_(nullptr),
+ user_comparator_(nullptr) {
+}
+
+CompactedDBImpl::~CompactedDBImpl() {
+}
+
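+// Binary-search the sorted, non-overlapping files by largest user key and
+// return the index of the first file whose largest key is not less than
+// `key`; the last file is returned if `key` is greater than every other
+// file's largest key.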
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+ size_t right = files_.num_files - 1;
+ auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+ return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
+ };
+ return static_cast<size_t>(std::lower_bound(files_.files,
+ files_.files + right, key, cmp) - files_.files);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+ const Slice& key, PinnableSlice* value) {
+ GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, key, value, nullptr, nullptr,
+ true, nullptr, nullptr);
+ LookupKey lkey(key, kMaxSequenceNumber);
+ files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(),
+ &get_context, nullptr);
+ if (get_context.State() == GetContext::kFound) {
+ return Status::OK();
+ }
+ return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ autovector<TableReader*, 16> reader_list;
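+ // First pass: for each key, locate its candidate file. If the key sorts
+ // below that file's smallest key it cannot exist, so remember a null
+ // reader; otherwise prepare the table reader for the lookup in the second
+ // pass.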
+ for (const auto& key : keys) {
+ const FdWithKeyRange& f = files_.files[FindFile(key)];
+ if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) {
+ reader_list.push_back(nullptr);
+ } else {
+ LookupKey lkey(key, kMaxSequenceNumber);
+ f.fd.table_reader->Prepare(lkey.internal_key());
+ reader_list.push_back(f.fd.table_reader);
+ }
+ }
+ std::vector<Status> statuses(keys.size(), Status::NotFound());
+ values->resize(keys.size());
+ int idx = 0;
+ for (auto* r : reader_list) {
+ if (r != nullptr) {
+ PinnableSlice pinnable_val;
+ std::string& value = (*values)[idx];
+ GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, keys[idx], &pinnable_val,
+ nullptr, nullptr, true, nullptr, nullptr);
+ LookupKey lkey(keys[idx], kMaxSequenceNumber);
+ r->Get(options, lkey.internal_key(), &get_context, nullptr);
+ value.assign(pinnable_val.data(), pinnable_val.size());
+ if (get_context.State() == GetContext::kFound) {
+ statuses[idx] = Status::OK();
+ }
+ }
+ ++idx;
+ }
+ return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ mutex_.Lock();
+ ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(options));
+ Status s = Recover({cf}, true /* read only */, false, true);
+ if (s.ok()) {
+ cfd_ = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ DefaultColumnFamily())->cfd();
+ cfd_->InstallSuperVersion(&sv_context, &mutex_);
+ }
+ mutex_.Unlock();
+ sv_context.Clean();
+ if (!s.ok()) {
+ return s;
+ }
+ NewThreadStatusCfInfo(cfd_);
+ version_ = cfd_->GetSuperVersion()->current;
+ user_comparator_ = cfd_->user_comparator();
+ auto* vstorage = version_->storage_info();
+ if (vstorage->num_non_empty_levels() == 0) {
+ return Status::NotSupported("no file exists");
+ }
+ const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+ // In fully compacted mode, L0 may contain at most one file.
+ if (l0.num_files > 1) {
+ return Status::NotSupported("L0 contain more than 1 file");
+ }
+ if (l0.num_files == 1) {
+ if (vstorage->num_non_empty_levels() > 1) {
+ return Status::NotSupported("Both L0 and other level contain files");
+ }
+ files_ = l0;
+ return Status::OK();
+ }
+
+ for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+ if (vstorage->LevelFilesBrief(i).num_files > 0) {
+ return Status::NotSupported("Other levels also contain files");
+ }
+ }
+
+ int level = vstorage->num_non_empty_levels() - 1;
+ if (vstorage->LevelFilesBrief(level).num_files > 0) {
+ files_ = vstorage->LevelFilesBrief(level);
+ return Status::OK();
+ }
+ return Status::NotSupported("no file exists");
+}
+
+Status CompactedDBImpl::Open(const Options& options,
+ const std::string& dbname, DB** dbptr) {
+ *dbptr = nullptr;
+
+ if (options.max_open_files != -1) {
+ return Status::InvalidArgument("require max_open_files = -1");
+ }
+ if (options.merge_operator.get() != nullptr) {
+ return Status::InvalidArgument("merge operator is not supported");
+ }
+ DBOptions db_options(options);
+ std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+ Status s = db->Init(options);
+ if (s.ok()) {
+ db->StartTimedTasks();
+ ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
+ "Opened the db as fully compacted mode");
+ LogFlush(db->immutable_db_options_.info_log);
+ *dbptr = db.release();
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compacted_db_impl.h b/src/rocksdb/db/compacted_db_impl.h
new file mode 100644
index 000000000..7099566fc
--- /dev/null
+++ b/src/rocksdb/db/compacted_db_impl.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactedDBImpl : public DBImpl {
+ public:
+ CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ CompactedDBImpl(const CompactedDBImpl&) = delete;
+ void operator=(const CompactedDBImpl&) = delete;
+
+ virtual ~CompactedDBImpl();
+
+ static Status Open(const Options& options, const std::string& dbname,
+ DB** dbptr);
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys, std::vector<std::string>* values)
+ override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ private:
+ friend class DB;
+ inline size_t FindFile(const Slice& key);
+ Status Init(const Options& options);
+
+ ColumnFamilyData* cfd_;
+ Version* version_;
+ const Comparator* user_comparator_;
+ LevelFilesBrief files_;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
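
Editorial note (not part of the diff): a consequence of the header above is that every mutating operation on a compacted DB returns Status::NotSupported rather than asserting. The short sketch below is illustrative only; TryPut is a hypothetical helper and the DB* is assumed to come from a read-only/compacted open such as the one sketched earlier.

// Sketch: tolerate NotSupported from write operations on a compacted DB.
#include <iostream>

#include "rocksdb/db.h"

void TryPut(rocksdb::DB* db) {
  rocksdb::Status s = db->Put(rocksdb::WriteOptions(), "key", "value");
  if (s.IsNotSupported()) {
    // Expected for CompactedDBImpl: all mutations return NotSupported.
    std::cerr << "read-only/compacted DB: " << s.ToString() << std::endl;
  }
}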
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..5c34fdcaa
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,564 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "rocksdb/compaction_filter.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+ PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b) {
+ auto c = user_cmp->Compare(a.user_key(), b.user_key());
+ if (c != 0) {
+ return c;
+ }
+ auto a_footer = ExtractInternalKeyFooter(a.Encode());
+ auto b_footer = ExtractInternalKeyFooter(b.Encode());
+ if (a_footer == kRangeTombstoneSentinel) {
+ if (b_footer != kRangeTombstoneSentinel) {
+ return -1;
+ }
+ } else if (b_footer == kRangeTombstoneSentinel) {
+ return 1;
+ }
+ return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b) {
+ if (a == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b) {
+ if (b == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, a, *b);
+}
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->fd.GetFileSize();
+ }
+ return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+ input_version_ = _input_version;
+ cfd_ = input_version_->cfd();
+
+ cfd_->Ref();
+ input_version_->Ref();
+ edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+ Slice* largest_user_key) {
+ bool initialized = false;
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i].files.empty()) {
+ continue;
+ }
+ if (inputs[i].level == 0) {
+ // we need to consider all files on level 0
+ for (const auto* f : inputs[i].files) {
+ const Slice& start_user_key = f->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = f->largest.user_key();
+ if (!initialized ||
+ ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ } else {
+ // we only need to consider the first and last file
+ const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+ if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].level == 0 || inputs[i].files.empty()) {
+ continue;
+ }
+ inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
+ AtomicCompactionUnitBoundary cur_boundary;
+ size_t first_atomic_idx = 0;
+ auto add_unit_boundary = [&](size_t to) {
+ if (first_atomic_idx == to) return;
+ for (size_t k = first_atomic_idx; k < to; k++) {
+ inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+ }
+ first_atomic_idx = to;
+ };
+ for (size_t j = 0; j < inputs[i].files.size(); j++) {
+ const auto* f = inputs[i].files[j];
+ if (j == 0) {
+ // First file in a level.
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+ 0) {
+ // SSTs overlap but the end key of the previous file was not
+ // artificially extended by a range tombstone. Extend the current
+ // boundary.
+ cur_boundary.largest = &f->largest;
+ } else {
+ // Atomic compaction unit has ended.
+ add_unit_boundary(j);
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ }
+ }
+ add_unit_boundary(inputs[i].files.size());
+ assert(inputs[i].files.size() ==
+ inputs[i].atomic_compaction_unit_boundaries.size());
+ }
+ return inputs;
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ int output_l0_idx;
+ if (output_level == 0) {
+ output_l0_idx = 0;
+ for (const auto* file : vstorage->LevelFiles(0)) {
+ if (inputs[0].files.back() == file) {
+ break;
+ }
+ ++output_l0_idx;
+ }
+ assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+ } else {
+ output_l0_idx = -1;
+ }
+ Slice smallest_key, largest_key;
+ GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+ return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+ output_level, output_l0_idx);
+}
+
+// test function to validate the functionality of IsBottommostLevel()
+// function -- determines if compaction with inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ size_t num_files_in_compaction = 0;
+ size_t total_num_files = 0;
+ for (int l = 0; l < vstorage->num_levels(); l++) {
+ total_num_files += vstorage->NumLevelFiles(l);
+ }
+ for (size_t i = 0; i < inputs.size(); i++) {
+ num_files_in_compaction += inputs[i].size();
+ }
+ return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(VersionStorageInfo* vstorage,
+ const ImmutableCFOptions& _immutable_cf_options,
+ const MutableCFOptions& _mutable_cf_options,
+ std::vector<CompactionInputFiles> _inputs,
+ int _output_level, uint64_t _target_file_size,
+ uint64_t _max_compaction_bytes, uint32_t _output_path_id,
+ CompressionType _compression,
+ CompressionOptions _compression_opts,
+ uint32_t _max_subcompactions,
+ std::vector<FileMetaData*> _grandparents,
+ bool _manual_compaction, double _score,
+ bool _deletion_compaction,
+ CompactionReason _compaction_reason)
+ : input_vstorage_(vstorage),
+ start_level_(_inputs[0].level),
+ output_level_(_output_level),
+ max_output_file_size_(_target_file_size),
+ max_compaction_bytes_(_max_compaction_bytes),
+ max_subcompactions_(_max_subcompactions),
+ immutable_cf_options_(_immutable_cf_options),
+ mutable_cf_options_(_mutable_cf_options),
+ input_version_(nullptr),
+ number_levels_(vstorage->num_levels()),
+ cfd_(nullptr),
+ output_path_id_(_output_path_id),
+ output_compression_(_compression),
+ output_compression_opts_(_compression_opts),
+ deletion_compaction_(_deletion_compaction),
+ inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+ grandparents_(std::move(_grandparents)),
+ score_(_score),
+ bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+ is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+ is_manual_compaction_(_manual_compaction),
+ is_trivial_move_(false),
+ compaction_reason_(_compaction_reason) {
+ MarkFilesBeingCompacted(true);
+ if (is_manual_compaction_) {
+ compaction_reason_ = CompactionReason::kManualCompaction;
+ }
+ if (max_subcompactions_ == 0) {
+ max_subcompactions_ = immutable_cf_options_.max_subcompactions;
+ }
+ if (!bottommost_level_) {
+ // Currently we only enable dictionary compression during compaction to the
+ // bottommost level.
+ output_compression_opts_.max_dict_bytes = 0;
+ output_compression_opts_.zstd_max_train_bytes = 0;
+ }
+
+#ifndef NDEBUG
+ for (size_t i = 1; i < inputs_.size(); ++i) {
+ assert(inputs_[i].level > inputs_[i - 1].level);
+ }
+#endif
+
+ // setup input_levels_
+ {
+ input_levels_.resize(num_input_levels());
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+ &arena_);
+ }
+ }
+
+ GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+}
+
+Compaction::~Compaction() {
+ if (input_version_ != nullptr) {
+ input_version_->Unref();
+ }
+ if (cfd_ != nullptr) {
+ cfd_->UnrefAndTryDelete();
+ }
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+ int base_level = input_vstorage_->base_level();
+ bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_,
+ mutable_cf_options_, start_level_,
+ base_level) == output_compression_);
+ if (matches) {
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+ return true;
+ }
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+ return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+ // Avoid a move if there is lots of overlapping grandparent data.
+ // Otherwise, the move could create a parent file that will require
+ // a very expensive merge later on.
+ // If start_level_ == output_level_, the purpose is to force the compaction
+ // filter to be applied to that level, so this cannot be a trivial move.
+
+ // Check if the start level has files with overlapping ranges
+ if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) {
+ // We cannot move files from L0 to L1 if the files are overlapping
+ return false;
+ }
+
+ if (is_manual_compaction_ &&
+ (immutable_cf_options_.compaction_filter != nullptr ||
+ immutable_cf_options_.compaction_filter_factory != nullptr)) {
+ // This is a manual compaction and we have a compaction filter that should
+ // be executed, so we cannot do a trivial move
+ return false;
+ }
+
+ // Used in universal compaction, where trivial move can be done if the
+ // input files are non-overlapping
+ if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+ (output_level_ != 0)) {
+ return is_trivial_move_;
+ }
+
+ if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+ input(0, 0)->fd.GetPathId() == output_path_id() &&
+ InputCompressionMatchesOutput())) {
+ return false;
+ }
+
+ // assert inputs_.size() == 1
+
+ for (const auto& file : inputs_.front().files) {
+ std::vector<FileMetaData*> file_grand_parents;
+ if (output_level_ + 1 >= number_levels_) {
+ continue;
+ }
+ input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+ &file->largest, &file_grand_parents);
+ const auto compaction_size =
+ file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+ if (compaction_size > max_compaction_bytes_) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ for (size_t i = 0; i < inputs_[which].size(); i++) {
+ out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+ }
+ }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ assert(input_version_ != nullptr);
+ assert(level_ptrs != nullptr);
+ assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+ if (bottommost_level_) {
+ return true;
+ } else if (output_level_ != 0 &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ // Maybe use binary search to find right entry instead of linear search?
+ const Comparator* user_cmp = cfd_->user_comparator();
+ for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+ const std::vector<FileMetaData*>& files =
+ input_vstorage_->LevelFiles(lvl);
+ for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+ auto* f = files[level_ptrs->at(lvl)];
+ if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ // We've advanced far enough
+ if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+ // Key falls in this file's range, so it may
+ // exist beyond output level
+ return false;
+ }
+ break;
+ }
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+ for (size_t i = 0; i < num_input_levels(); i++) {
+ for (size_t j = 0; j < inputs_[i].size(); j++) {
+ assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+ : inputs_[i][j]->being_compacted);
+ inputs_[i][j]->being_compacted = mark_as_compacted;
+ }
+ }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+ InputLevelSummaryBuffer* scratch) const {
+ int len = 0;
+ bool is_first = true;
+ for (auto& input_level : inputs_) {
+ if (input_level.empty()) {
+ continue;
+ }
+ if (!is_first) {
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ } else {
+ is_first = false;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+ input_level.level);
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " files to L%d", output_level());
+
+ return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+ uint64_t size = 0;
+ for (auto& input_level : inputs_) {
+ for (auto f : input_level.files) {
+ size += f->fd.GetFileSize();
+ }
+ }
+ return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+ MarkFilesBeingCompacted(false);
+ cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+ assert(input_version_ != nullptr);
+ input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+ int len) {
+ *output = '\0';
+ int write = 0;
+ for (size_t i = 0; i < files.size(); i++) {
+ int sz = len - write;
+ int ret;
+ char sztxt[16];
+ AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+ ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+ files.at(i)->fd.GetNumber(), sztxt);
+ if (ret < 0 || ret >= sz) break;
+ write += ret;
+ }
+ // if files.size() is non-zero, overwrite the last space
+ return write - !!files.size();
+}
+} // namespace
+
+void Compaction::Summary(char* output, int len) {
+ int write =
+ snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+ input_version_->GetVersionNumber(), start_level_);
+ if (write < 0 || write >= len) {
+ return;
+ }
+
+ for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+ if (level_iter > 0) {
+ write += snprintf(output + write, len - write, "], [");
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+ write +=
+ InputSummary(inputs_[level_iter].files, output + write, len - write);
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+
+ snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+ uint64_t preallocation_size = 0;
+
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ preallocation_size += file->fd.GetFileSize();
+ }
+ }
+
+ if (max_output_file_size_ != port::kMaxUint64 &&
+ (immutable_cf_options_.compaction_style == kCompactionStyleLevel ||
+ output_level() > 0)) {
+ preallocation_size = std::min(max_output_file_size_, preallocation_size);
+ }
+
+ // Over-estimate slightly so we don't end up just barely crossing
+ // the threshold
+ // No point in preallocating more than 1GB.
+ return std::min(uint64_t{1073741824},
+ preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+ if (!cfd_->ioptions()->compaction_filter_factory) {
+ return nullptr;
+ }
+
+ CompactionFilter::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.column_family_id = cfd_->GetID();
+ return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+ context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+ return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+ if (max_subcompactions_ <= 1 || cfd_ == nullptr) {
+ return false;
+ }
+ if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 &&
+ !IsOutputLevelEmpty();
+ } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+ return number_levels_ > 1 && output_level_ > 0;
+ } else {
+ return false;
+ }
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime() const {
+ uint64_t min_oldest_ancester_time = port::kMaxUint64;
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0) {
+ min_oldest_ancester_time =
+ std::min(min_oldest_ancester_time, oldest_ancester_time);
+ }
+ }
+ }
+ return min_oldest_ancester_time;
+}
+
+int Compaction::GetInputBaseLevel() const {
+ return input_vstorage_->base_level();
+}
+
+} // namespace ROCKSDB_NAMESPACE
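
Editorial note (not part of the diff): the preallocation logic at the end of this file reduces to a small rule: estimate the output size from the inputs, add roughly 10% of slack so the output does not just barely cross the preallocated size, and cap the result at 1 GB. The standalone sketch below restates only that arithmetic; it is an illustration, not the RocksDB implementation.

// Simplified restatement of the final step of
// Compaction::OutputFilePreallocationSize: add ~10% slack, cap at 1 GB.
#include <algorithm>
#include <cstdint>
#include <iostream>

uint64_t PreallocationSize(uint64_t estimated_output_bytes) {
  const uint64_t kMaxPrealloc = 1073741824;  // 1 GB, as in the code above
  return std::min(kMaxPrealloc,
                  estimated_output_bytes + estimated_output_bytes / 10);
}

int main() {
  std::cout << PreallocationSize(64ull << 20) << "\n";    // ~70 MiB for a 64 MiB input
  std::cout << PreallocationSize(2048ull << 20) << "\n";  // capped at 1 GB
  return 0;
}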
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..9358e50ff
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,384 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// This file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+ const InternalKey* smallest = nullptr;
+ const InternalKey* largest = nullptr;
+};
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+ int level;
+ std::vector<FileMetaData*> files;
+ std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+ inline bool empty() const { return files.empty(); }
+ inline size_t size() const { return files.size(); }
+ inline void clear() { files.clear(); }
+ inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+ Compaction(VersionStorageInfo* input_version,
+ const ImmutableCFOptions& immutable_cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ std::vector<CompactionInputFiles> inputs, int output_level,
+ uint64_t target_file_size, uint64_t max_compaction_bytes,
+ uint32_t output_path_id, CompressionType compression,
+ CompressionOptions compression_opts, uint32_t max_subcompactions,
+ std::vector<FileMetaData*> grandparents,
+ bool manual_compaction = false, double score = -1,
+ bool deletion_compaction = false,
+ CompactionReason compaction_reason = CompactionReason::kUnknown);
+
+ // No copying allowed
+ Compaction(const Compaction&) = delete;
+ void operator=(const Compaction&) = delete;
+
+ ~Compaction();
+
+ // Returns the level associated with the specified compaction input level.
+ // If compaction_input_level is not specified, then input_level is set to 0.
+ int level(size_t compaction_input_level = 0) const {
+ return inputs_[compaction_input_level].level;
+ }
+
+ int start_level() const { return start_level_; }
+
+ // Outputs will go to this level
+ int output_level() const { return output_level_; }
+
+ // Returns the number of input levels in this compaction.
+ size_t num_input_levels() const { return inputs_.size(); }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // Returns the number of input files associated with the specified
+ // compaction input level.
+ // The function will return 0 when "compaction_input_level" < 0
+ // or "compaction_input_level" >= "num_input_levels()".
+ size_t num_input_files(size_t compaction_input_level) const {
+ if (compaction_input_level < inputs_.size()) {
+ return inputs_[compaction_input_level].size();
+ }
+ return 0;
+ }
+
+ // Returns input version of the compaction
+ Version* input_version() const { return input_version_; }
+
+ // Returns the ColumnFamilyData associated with the compaction.
+ ColumnFamilyData* column_family_data() const { return cfd_; }
+
+ // Returns the file meta data of the 'i'th input file at the
+ // specified compaction input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ FileMetaData* input(size_t compaction_input_level, size_t i) const {
+ assert(compaction_input_level < inputs_.size());
+ return inputs_[compaction_input_level][i];
+ }
+
+ const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+ }
+
+ // Returns the list of file meta data of the specified compaction
+ // input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ const std::vector<FileMetaData*>* inputs(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].files;
+ }
+
+ const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+ // Returns the LevelFilesBrief of the specified compaction input level.
+ const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+ return &input_levels_[compaction_input_level];
+ }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+ // What compression for output
+ CompressionType output_compression() const { return output_compression_; }
+
+ // What compression options for output
+ CompressionOptions output_compression_opts() const {
+ return output_compression_opts_;
+ }
+
+ // Whether the output file needs to be written to a second DB path.
+ uint32_t output_path_id() const { return output_path_id_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // If true, then the compaction can be done by simply deleting input files.
+ bool deletion_compaction() const { return deletion_compaction_; }
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the available information we have guarantees that
+ // the input "user_key" does not exist in any level beyond "output_level()".
+ bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+ std::vector<size_t>* level_ptrs) const;
+
+ // Clear all files to indicate that they are not being compacted
+ // Delete this compaction from the list of running compactions.
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Status status);
+
+ // Returns the summary of the compaction in "output" with maximum "len"
+ // in bytes. The caller is responsible for the memory management of
+ // "output".
+ void Summary(char* output, int len);
+
+ // Return the score that was used to pick this compaction run.
+ double score() const { return score_; }
+
+ // Is this compaction creating a file in the bottommost level?
+ bool bottommost_level() const { return bottommost_level_; }
+
+ // Does this compaction include all sst files?
+ bool is_full_compaction() const { return is_full_compaction_; }
+
+ // Was this compaction triggered manually by the client?
+ bool is_manual_compaction() const { return is_manual_compaction_; }
+
+ // Used when the allow_trivial_move option is set in
+ // Universal compaction. If all the input files are
+ // non-overlapping, is_trivial_move_ is set to true;
+ // otherwise it is set to false.
+ void set_is_trivial_move(bool trivial_move) {
+ is_trivial_move_ = trivial_move;
+ }
+
+ // Used when allow_trivial_move option is set in
+ // Universal compaction. Returns true, if the input files
+ // are non-overlapping and can be trivially moved.
+ bool is_trivial_move() const { return is_trivial_move_; }
+
+ // How many total levels are there?
+ int number_levels() const { return number_levels_; }
+
+ // Return the ImmutableCFOptions that should be used throughout the compaction
+ // procedure
+ const ImmutableCFOptions* immutable_cf_options() const {
+ return &immutable_cf_options_;
+ }
+
+ // Return the MutableCFOptions that should be used throughout the compaction
+ // procedure
+ const MutableCFOptions* mutable_cf_options() const {
+ return &mutable_cf_options_;
+ }
+
+ // Returns the size in bytes that the output file should be preallocated to.
+ // In level compaction, that is max_file_size_. In universal compaction, that
+ // is the sum of all input file sizes.
+ uint64_t OutputFilePreallocationSize() const;
+
+ void SetInputVersion(Version* input_version);
+
+ struct InputLevelSummaryBuffer {
+ char buffer[128];
+ };
+
+ const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+ uint64_t CalculateTotalInputSize() const;
+
+ // In case of compaction error, reset the nextIndex that is used
+ // to pick up the next file to be compacted from files_by_size_
+ void ResetNextCompactionIndex();
+
+ // Create a CompactionFilter from compaction_filter_factory
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+ // Is the input level corresponding to output_level_ empty?
+ bool IsOutputLevelEmpty() const;
+
+ // Should this compaction be broken up into smaller ones run in parallel?
+ bool ShouldFormSubcompactions() const;
+
+ // test function to validate the functionality of IsBottommostLevel()
+ // function -- determines if compaction with inputs and storage is bottommost
+ static bool TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ TablePropertiesCollection GetOutputTableProperties() const {
+ return output_table_properties_;
+ }
+
+ void SetOutputTableProperties(TablePropertiesCollection tp) {
+ output_table_properties_ = std::move(tp);
+ }
+
+ Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+ Slice GetLargestUserKey() const { return largest_user_key_; }
+
+ int GetInputBaseLevel() const;
+
+ CompactionReason compaction_reason() { return compaction_reason_; }
+
+ const std::vector<FileMetaData*>& grandparents() const {
+ return grandparents_;
+ }
+
+ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+ uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+ uint64_t MinInputFileOldestAncesterTime() const;
+
+ private:
+ // mark (or clear) all files that are being compacted
+ void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+ // get the smallest and largest key present in files to be compacted
+ static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs,
+ Slice* smallest_key, Slice* largest_key);
+
+ // Get the atomic file boundaries for all files in the compaction. Necessary
+ // in order to avoid the scenario described in
+ // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and plumb
+ // down appropriate key boundaries to RangeDelAggregator during compaction.
+ static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+ // helper function to determine if compaction with inputs and storage is
+ // bottommost
+ static bool IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ static bool IsFullCompaction(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ VersionStorageInfo* input_vstorage_;
+
+ const int start_level_; // the lowest level to be compacted
+ const int output_level_; // level to which output files are stored
+ uint64_t max_output_file_size_;
+ uint64_t max_compaction_bytes_;
+ uint32_t max_subcompactions_;
+ const ImmutableCFOptions immutable_cf_options_;
+ const MutableCFOptions mutable_cf_options_;
+ Version* input_version_;
+ VersionEdit edit_;
+ const int number_levels_;
+ ColumnFamilyData* cfd_;
+ Arena arena_; // Arena used to allocate space for file_levels_
+
+ const uint32_t output_path_id_;
+ CompressionType output_compression_;
+ CompressionOptions output_compression_opts_;
+ // If true, then the compaction can be done by simply deleting input files.
+ const bool deletion_compaction_;
+
+ // Compaction input files organized by level. Constant after construction
+ const std::vector<CompactionInputFiles> inputs_;
+
+ // A copy of inputs_, organized more closely in memory
+ autovector<LevelFilesBrief, 2> input_levels_;
+
+ // State used to check for number of overlapping grandparent files
+ // (grandparent == "output_level_ + 1")
+ std::vector<FileMetaData*> grandparents_;
+ const double score_; // score that was used to pick this compaction.
+
+ // Is this compaction creating a file in the bottommost level?
+ const bool bottommost_level_;
+ // Does this compaction include all sst files?
+ const bool is_full_compaction_;
+
+ // Is this compaction requested by the client?
+ const bool is_manual_compaction_;
+
+ // True if we can do trivial move in Universal multi level
+ // compaction
+ bool is_trivial_move_;
+
+ // Does input compression match the output compression?
+ bool InputCompressionMatchesOutput() const;
+
+ // table properties of output files
+ TablePropertiesCollection output_table_properties_;
+
+ // smallest user key in compaction
+ Slice smallest_user_key_;
+
+ // largest user key in compaction
+ Slice largest_user_key_;
+
+ // Reason for compaction
+ CompactionReason compaction_reason_;
+};
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
+} // namespace ROCKSDB_NAMESPACE
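
Editorial note (not part of the diff): the sstableKeyCompare contract documented at the top of this header can be summarized with a simplified model: compare user keys first, and on a tie sort a range-tombstone sentinel boundary before an ordinary key with the same user key, so adjacent SSTs separated only by a sentinel are not treated as overlapping. The sketch below uses hypothetical standalone types (BoundaryKey, BoundaryKeyCompare) rather than the internal InternalKey machinery.

// Simplified illustration of the boundary-ordering rule; not the RocksDB code.
#include <cassert>
#include <string>

struct BoundaryKey {
  std::string user_key;
  bool is_range_tombstone_sentinel = false;  // stands in for the seq/type footer
};

int BoundaryKeyCompare(const BoundaryKey& a, const BoundaryKey& b) {
  int c = a.user_key.compare(b.user_key);
  if (c != 0) {
    return c < 0 ? -1 : 1;
  }
  if (a.is_range_tombstone_sentinel && !b.is_range_tombstone_sentinel) {
    return -1;
  }
  if (!a.is_range_tombstone_sentinel && b.is_range_tombstone_sentinel) {
    return 1;
  }
  return 0;
}

int main() {
  BoundaryKey file1_largest{"k", /*is_range_tombstone_sentinel=*/true};
  BoundaryKey file2_smallest{"k", /*is_range_tombstone_sentinel=*/false};
  // The sentinel-extended end of file1 sorts strictly before the real key "k"
  // in file2, so the two files are not considered overlapping.
  assert(BoundaryKeyCompare(file1_largest, file2_smallest) < 0);
  return 0;
}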
diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 000000000..963c1d8eb
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+struct CompactionIterationStats {
+ // Compaction statistics
+
+ // Doesn't include records skipped because of
+ // CompactionFilter::Decision::kRemoveAndSkipUntil.
+ int64_t num_record_drop_user = 0;
+
+ int64_t num_record_drop_hidden = 0;
+ int64_t num_record_drop_obsolete = 0;
+ int64_t num_record_drop_range_del = 0;
+ int64_t num_range_del_drop_obsolete = 0;
+ // Deletions obsoleted before bottom level due to file gap optimization.
+ int64_t num_optimized_del_drop_obsolete = 0;
+ uint64_t total_filter_time = 0;
+
+ // Input statistics
+ // TODO(noetzli): The stats are incomplete. They are lacking everything
+ // consumed by MergeHelper.
+ uint64_t num_input_records = 0;
+ uint64_t num_input_deletion_records = 0;
+ uint64_t num_input_corrupt_records = 0;
+ uint64_t total_input_raw_key_bytes = 0;
+ uint64_t total_input_raw_value_bytes = 0;
+
+ // Single-Delete diagnostics for exceptional situations
+ uint64_t num_single_del_fallthru = 0;
+ uint64_t num_single_del_mismatch = 0;
+};
diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc
new file mode 100644
index 000000000..1bebfc717
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.cc
@@ -0,0 +1,774 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cinttypes>
+
+#include "db/compaction/compaction_iterator.h"
+#include "db/snapshot_checker.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+#define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \
+ ((seq) <= (snapshot) && \
+ (snapshot_checker_ == nullptr || \
+ LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \
+ SnapshotCheckerResult::kInSnapshot)))
+
+#define DEFINITELY_NOT_IN_SNAPSHOT(seq, snapshot) \
+ ((seq) > (snapshot) || \
+ (snapshot_checker_ != nullptr && \
+ UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \
+ SnapshotCheckerResult::kNotInSnapshot)))
+
+#define IN_EARLIEST_SNAPSHOT(seq) \
+ ((seq) <= earliest_snapshot_ && \
+ (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq))))
+
+namespace ROCKSDB_NAMESPACE {
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ const std::atomic<bool>* manual_compaction_paused,
+ const std::shared_ptr<Logger> info_log)
+ : CompactionIterator(
+ input, cmp, merge_helper, last_sequence, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, env,
+ report_detailed_time, expect_valid_internal_key, range_del_agg,
+ std::unique_ptr<CompactionProxy>(
+ compaction ? new CompactionProxy(compaction) : nullptr),
+ compaction_filter, shutting_down, preserve_deletes_seqnum,
+ manual_compaction_paused, info_log) {}
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ const std::atomic<bool>* manual_compaction_paused,
+ const std::shared_ptr<Logger> info_log)
+ : input_(input),
+ cmp_(cmp),
+ merge_helper_(merge_helper),
+ snapshots_(snapshots),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ env_(env),
+ report_detailed_time_(report_detailed_time),
+ expect_valid_internal_key_(expect_valid_internal_key),
+ range_del_agg_(range_del_agg),
+ compaction_(std::move(compaction)),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ manual_compaction_paused_(manual_compaction_paused),
+ preserve_deletes_seqnum_(preserve_deletes_seqnum),
+ current_user_key_sequence_(0),
+ current_user_key_snapshot_(0),
+ merge_out_iter_(merge_helper_),
+ current_key_committed_(false),
+ info_log_(info_log) {
+ assert(compaction_filter_ == nullptr || compaction_ != nullptr);
+ assert(snapshots_ != nullptr);
+ bottommost_level_ =
+ compaction_ == nullptr ? false : compaction_->bottommost_level();
+ if (compaction_ != nullptr) {
+ level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+ }
+ if (snapshots_->size() == 0) {
+ // optimize for fast path if there are no snapshots
+ visible_at_tip_ = true;
+ earliest_snapshot_iter_ = snapshots_->end();
+ earliest_snapshot_ = kMaxSequenceNumber;
+ latest_snapshot_ = 0;
+ } else {
+ visible_at_tip_ = false;
+ earliest_snapshot_iter_ = snapshots_->begin();
+ earliest_snapshot_ = snapshots_->at(0);
+ latest_snapshot_ = snapshots_->back();
+ }
+#ifndef NDEBUG
+ // findEarliestVisibleSnapshot assumes this ordering.
+ for (size_t i = 1; i < snapshots_->size(); ++i) {
+ assert(snapshots_->at(i - 1) < snapshots_->at(i));
+ }
+#endif
+ input_->SetPinnedItersMgr(&pinned_iters_mgr_);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+ // The input_ iterator's lifetime is longer than the pinned_iters_mgr_ lifetime
+ input_->SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+ iter_stats_.num_record_drop_user = 0;
+ iter_stats_.num_record_drop_hidden = 0;
+ iter_stats_.num_record_drop_obsolete = 0;
+ iter_stats_.num_record_drop_range_del = 0;
+ iter_stats_.num_range_del_drop_obsolete = 0;
+ iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+ NextFromInput();
+ PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+ // If there is a merge output, return it before continuing to process the
+ // input.
+ if (merge_out_iter_.Valid()) {
+ merge_out_iter_.Next();
+
+ // Check if we returned all records of the merge output.
+ if (merge_out_iter_.Valid()) {
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ bool valid_key __attribute__((__unused__));
+ valid_key = ParseInternalKey(key_, &ikey_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include them in the result, so we expect the keys here to be valid.
+ assert(valid_key);
+ if (!valid_key) {
+ ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction",
+ key_.ToString(true).c_str());
+ }
+
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ valid_ = true;
+ } else {
+ // We consumed all pinned merge operands, release pinned iterators
+ pinned_iters_mgr_.ReleasePinnedData();
+ // MergeHelper moves the iterator to the first record after the merged
+ // records, so even though we reached the end of the merge output, we do
+ // not want to advance the iterator.
+ NextFromInput();
+ }
+ } else {
+ // Only advance the input iterator if there is no merge output and the
+ // iterator is not already at the next record.
+ if (!at_next_) {
+ input_->Next();
+ }
+ NextFromInput();
+ }
+
+ if (valid_) {
+ // Record that we've outputted a record for the current key.
+ has_outputted_key_ = true;
+ }
+
+ PrepareOutput();
+}
+
+void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
+ Slice* skip_until) {
+ if (compaction_filter_ != nullptr &&
+ (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) {
+ // If the user has specified a compaction filter and the sequence
+ // number is greater than any external snapshot, then invoke the
+ // filter. If the return value of the compaction filter is true,
+ // replace the entry with a deletion marker.
+ CompactionFilter::Decision filter;
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ CompactionFilter::ValueType value_type =
+ ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+ : CompactionFilter::ValueType::kBlobIndex;
+ // Hack: pass internal key to BlobIndexCompactionFilter since it needs
+ // to get sequence number.
+ Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_;
+ {
+ StopWatchNano timer(env_, report_detailed_time_);
+ filter = compaction_filter_->FilterV2(
+ compaction_->level(), filter_key, value_type, value_,
+ &compaction_filter_value_, compaction_filter_skip_until_.rep());
+ iter_stats_.total_filter_time +=
+ env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+ 0) {
+ // Can't skip to a key smaller than the current one.
+ // Keep the key as per FilterV2 documentation.
+ filter = CompactionFilter::Decision::kKeep;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemove) {
+ // convert the current key to a delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+ // no value associated with delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kChangeValue) {
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ *need_skip = true;
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ *skip_until = compaction_filter_skip_until_.Encode();
+ }
+ }
+}
+
+void CompactionIterator::NextFromInput() {
+ at_next_ = false;
+ valid_ = false;
+
+ while (!valid_ && input_->Valid() && !IsPausingManualCompaction() &&
+ !IsShuttingDown()) {
+ key_ = input_->key();
+ value_ = input_->value();
+ iter_stats_.num_input_records++;
+
+ if (!ParseInternalKey(key_, &ikey_)) {
+ // If `expect_valid_internal_key_` is false, return the corrupted key
+ // and let the caller decide what to do with it.
+ // TODO(noetzli): We should have a more elegant solution for this.
+ if (expect_valid_internal_key_) {
+ assert(!"Corrupted internal key not expected.");
+ status_ = Status::Corruption("Corrupted internal key not expected.");
+ break;
+ }
+ key_ = current_key_.SetInternalKey(key_);
+ has_current_user_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ iter_stats_.num_input_corrupt_records++;
+ valid_ = true;
+ break;
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
+
+ // Update input statistics
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) {
+ iter_stats_.num_input_deletion_records++;
+ }
+ iter_stats_.total_input_raw_key_bytes += key_.size();
+ iter_stats_.total_input_raw_value_bytes += value_.size();
+
+ // If need_skip is true, we should seek the input iterator
+ // to internal key skip_until and continue from there.
+ bool need_skip = false;
+ // Points either into compaction_filter_skip_until_ or into
+ // merge_helper_->compaction_filter_skip_until_.
+ Slice skip_until;
+
+ // Check whether the user key changed. After this if statement current_key_
+ // is a copy of the current input key (maybe converted to a delete by the
+ // compaction filter). ikey_.user_key is pointing to the copy.
+ if (!has_current_user_key_ ||
+ !cmp_->Equal(ikey_.user_key, current_user_key_)) {
+ // First occurrence of this user key
+ // Copy key for output
+ key_ = current_key_.SetInternalKey(key_, &ikey_);
+ current_user_key_ = ikey_.user_key;
+ has_current_user_key_ = true;
+ has_outputted_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+
+ // Apply the compaction filter to the first committed version of the user
+ // key.
+ if (current_key_committed_) {
+ InvokeFilterIfNeeded(&need_skip, &skip_until);
+ }
+ } else {
+ // Update the current key to reflect the new sequence number/type without
+ // copying the user key.
+ // TODO(rven): Compaction filter does not process keys in this path
+ // Need to have the compaction filter process multiple versions
+ // if we have versions on both sides of a snapshot
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+
+ // Note that a newer version of a key is ordered before older versions. If a
+ // newer version of a key is committed, so is the older version. No need
+ // to query snapshot_checker_ in that case.
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+ // Apply the compaction filter to the first committed version of the
+ // user key.
+ if (current_key_committed_) {
+ InvokeFilterIfNeeded(&need_skip, &skip_until);
+ }
+ }
+ }
+
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ valid_ = true;
+ break;
+ }
+
+ // If there are no snapshots, then this kv affects visibility at tip.
+ // Otherwise, search through all existing snapshots to find the earliest
+ // snapshot that is affected by this kv.
+ SequenceNumber last_sequence __attribute__((__unused__));
+ last_sequence = current_user_key_sequence_;
+ current_user_key_sequence_ = ikey_.sequence;
+ SequenceNumber last_snapshot = current_user_key_snapshot_;
+ SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+ current_user_key_snapshot_ =
+ visible_at_tip_
+ ? earliest_snapshot_
+ : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+ if (need_skip) {
+ // This case is handled below.
+ } else if (clear_and_output_next_key_) {
+ // In the previous iteration we encountered a single delete that we could
+ // not compact out. We will keep this Put, but can drop its data.
+ // (See Optimization 3, below.)
+ assert(ikey_.type == kTypeValue);
+ if (ikey_.type != kTypeValue) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected key type %d for compaction output",
+ ikey_.type);
+ }
+ assert(current_user_key_snapshot_ == last_snapshot);
+ if (current_user_key_snapshot_ != last_snapshot) {
+ ROCKS_LOG_FATAL(info_log_,
+ "current_user_key_snapshot_ (%" PRIu64
+ ") != last_snapshot (%" PRIu64 ")",
+ current_user_key_snapshot_, last_snapshot);
+ }
+
+ value_.clear();
+ valid_ = true;
+ clear_and_output_next_key_ = false;
+ } else if (ikey_.type == kTypeSingleDeletion) {
+ // We can compact out a SingleDelete if:
+ // 1) We encounter the corresponding PUT -OR- we know that this key
+ // doesn't appear past this output level
+ // =AND=
+ // 2) We've already returned a record in this snapshot -OR-
+ // there is no earlier earliest_write_conflict_snapshot.
+ //
+ // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
+ // allow Transactions to do write-conflict checking (if we compacted away
+ // all keys, then we wouldn't know that a write happened in this
+ // snapshot). If there is no earlier snapshot, then we know that there
+ // are no active transactions that need to know about any writes.
+ //
+ // Optimization 3:
+ // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+ // true, then we must output a SingleDelete. In this case, we will decide
+ // to also output the PUT. While we are compacting less by outputting the
+ // PUT now, hopefully this will lead to better compaction in the future
+ // when Rule 2 is later true (i.e., we are hoping we can later compact out
+ // both the SingleDelete and the Put, while we couldn't if we only
+ // outputted the SingleDelete now).
+ // In this case, we can save space by removing the PUT's value as it will
+ // never be read.
+ //
+ // Deletes and Merges are not supported on the same key that has a
+ // SingleDelete as it is not possible to correctly do any partial
+ // compaction of such a combination of operations. The result of mixing
+ // those operations for a given key is documented as being undefined. So
+ // we can choose how to handle such a combination of operations. We will
+ // try to compact out as much as we can in these cases.
+ // We will report counts on these anomalous cases.
+
+ // The easiest way to process a SingleDelete during iteration is to peek
+ // ahead at the next key.
+ ParsedInternalKey next_ikey;
+ input_->Next();
+
+ // Check whether the next key exists, is not corrupt, and is the same key
+ // as the single delete.
+ if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
+ // Check whether the next key belongs to the same snapshot as the
+ // SingleDelete.
+ if (prev_snapshot == 0 ||
+ DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot)) {
+ if (next_ikey.type == kTypeSingleDeletion) {
+ // We encountered two SingleDeletes in a row. This could be due to
+ // unexpected user input.
+ // Skip the first SingleDelete and let the next iteration decide how
+ // to handle the second SingleDelete
+
+ // First SingleDelete has been skipped since we already called
+ // input_->Next().
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ } else if (has_outputted_key_ ||
+ DEFINITELY_IN_SNAPSHOT(
+ ikey_.sequence, earliest_write_conflict_snapshot_)) {
+ // Found a matching value, we can drop the single delete and the
+ // value. It is safe to drop both records since we've already
+ // outputted a key in this snapshot, or there is no earlier
+ // snapshot (Rule 2 above).
+
+ // Note: it doesn't matter whether the second key is a Put or if it
+ // is an unexpected Merge or Delete. We will compact it out
+ // either way. We will maintain counts of how many mismatches
+ // happened
+ if (next_ikey.type != kTypeValue &&
+ next_ikey.type != kTypeBlobIndex) {
+ ++iter_stats_.num_single_del_mismatch;
+ }
+
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ // Already called input_->Next() once. Call it a second time to
+ // skip past the second key.
+ input_->Next();
+ } else {
+ // Found a matching value, but we cannot drop both keys since
+ // there is an earlier snapshot and we need to leave behind a record
+ // to know that a write happened in this snapshot (Rule 2 above).
+ // Clear the value and output the SingleDelete. (The value will be
+ // outputted on the next iteration.)
+
+ // Setting valid_ to true will output the current SingleDelete
+ valid_ = true;
+
+ // Set up the Put to be outputted in the next iteration.
+ // (Optimization 3).
+ clear_and_output_next_key_ = true;
+ }
+ } else {
+ // We hit the next snapshot without hitting a put, so the iterator
+ // returns the single delete.
+ valid_ = true;
+ }
+ } else {
+ // We are at the end of the input, could not parse the next key, or hit
+ // a different key. The iterator returns the single delete if the key
+ // possibly exists beyond the current output level. We set
+ // has_current_user_key to false so that if the iterator is at the next
+ // key, we do not compare it again against the previous key at the next
+ // iteration. If the next key is corrupt, we return before the
+ // comparison, so the value of has_current_user_key does not matter.
+ has_current_user_key_ = false;
+ if (compaction_ != nullptr && IN_EARLIEST_SNAPSHOT(ikey_.sequence) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // Key doesn't exist outside of this range.
+ // Can compact out this SingleDelete.
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_fallthru;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ } else {
+ // Output SingleDelete
+ valid_ = true;
+ }
+ }
+
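+ // If we decided to output something above, remember that input_ is
+ // already positioned at the following entry, so it must not be advanced
+ // a second time.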
+ if (valid_) {
+ at_next_ = true;
+ }
+ } else if (last_snapshot == current_user_key_snapshot_ ||
+ (last_snapshot > 0 &&
+ last_snapshot < current_user_key_snapshot_)) {
+ // If the earliest snapshot in which this key is visible is the same
+ // as the earliest snapshot in which a previous instance of the same
+ // key is visible, then this kv is not visible in any snapshot: it is
+ // hidden by a newer entry for the same user key.
+ //
+ // Note: Dropping this key will not affect TransactionDB write-conflict
+ // checking since there has already been a record returned for this key
+ // in this snapshot.
+ assert(last_sequence >= current_user_key_sequence_);
+ if (last_sequence < current_user_key_sequence_) {
+ ROCKS_LOG_FATAL(info_log_,
+ "last_sequence (%" PRIu64
+ ") < current_user_key_sequence_ (%" PRIu64 ")",
+ last_sequence, current_user_key_sequence_);
+ }
+
+ ++iter_stats_.num_record_drop_hidden; // (A)
+ input_->Next();
+ } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion &&
+ IN_EARLIEST_SNAPSHOT(ikey_.sequence) &&
+ ikeyNotNeededForIncrementalSnapshot() &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // TODO(noetzli): This is the only place where we use compaction_
+ // (besides the constructor). We should probably get rid of this
+ // dependency and find a way to do similar filtering during flushes.
+ //
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ //
+ // Note: Dropping this Delete will not affect TransactionDB
+ // write-conflict checking since it is earlier than any snapshot.
+ //
+ // It seems that we could also drop deletions later than the earliest
+ // snapshot, given that:
+ // (1) the deletion is earlier than earliest_write_conflict_snapshot, and
+ // (2) no value exists earlier than the deletion.
+ ++iter_stats_.num_record_drop_obsolete;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ input_->Next();
+ } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ &&
+ ikeyNotNeededForIncrementalSnapshot()) {
+ // Handle the case where we have a delete key at the bottommost level.
+ // We can skip outputting the key iff there are no subsequent puts for
+ // this key.
+ ParsedInternalKey next_ikey;
+ input_->Next();
+ // Skip over all versions of this key that happen to occur in the same
+ // snapshot range as the delete.
+ while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key) &&
+ (prev_snapshot == 0 ||
+ DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) {
+ input_->Next();
+ }
+ // If a record for this key remains after the skip, we still need to
+ // output the delete too.
+ if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
+ valid_ = true;
+ at_next_ = true;
+ }
+ } else if (ikey_.type == kTypeMerge) {
+ if (!merge_helper_->HasOperator()) {
+ status_ = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+
+ pinned_iters_mgr_.StartPinning();
+ // We know the merge type entry is not hidden, otherwise we would
+ // have hit (A).
+ // We encapsulate the merge-related state machine in a different
+ // object to minimize changes to the existing flow.
+ Status s = merge_helper_->MergeUntil(input_, range_del_agg_,
+ prev_snapshot, bottommost_level_);
+ merge_out_iter_.SeekToFirst();
+
+ if (!s.ok() && !s.IsMergeInProgress()) {
+ status_ = s;
+ return;
+ } else if (merge_out_iter_.Valid()) {
+ // NOTE: key, value, and ikey_ refer to old entries.
+ // These will be correctly set below.
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ bool valid_key __attribute__((__unused__));
+ valid_key = ParseInternalKey(key_, &ikey_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include it in the result, so we expect the keys here to be valid.
+ assert(valid_key);
+ if (!valid_key) {
+ ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction",
+ key_.ToString(true).c_str());
+ }
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ valid_ = true;
+ } else {
+ // All merge operands were filtered out. Reset the user key, since the
+ // batch consumed by the merge operator should not shadow any keys
+ // coming after the merges.
+ has_current_user_key_ = false;
+ pinned_iters_mgr_.ReleasePinnedData();
+
+ if (merge_helper_->FilteredUntil(&skip_until)) {
+ need_skip = true;
+ }
+ }
+ } else {
+ // 1. new user key -OR-
+ // 2. different snapshot stripe
+ bool should_delete = range_del_agg_->ShouldDelete(
+ key_, RangeDelPositioningMode::kForwardTraversal);
+ if (should_delete) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_range_del;
+ input_->Next();
+ } else {
+ valid_ = true;
+ }
+ }
+
+ if (need_skip) {
+ input_->Seek(skip_until);
+ }
+ }
+
+ if (!valid_ && IsShuttingDown()) {
+ status_ = Status::ShutdownInProgress();
+ }
+
+ if (IsPausingManualCompaction()) {
+ status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+}
+
+void CompactionIterator::PrepareOutput() {
+ if (valid_) {
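+ // Give the compaction filter a chance to rewrite blob references (e.g.
+ // for BlobDB garbage collection) before the entry is output.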
+ if (compaction_filter_ && ikey_.type == kTypeBlobIndex) {
+ const auto blob_decision = compaction_filter_->PrepareBlobOutput(
+ user_key(), value_, &compaction_filter_value_);
+
+ if (blob_decision == CompactionFilter::BlobDecision::kCorruption) {
+ status_ = Status::Corruption(
+ "Corrupted blob reference encountered during GC");
+ valid_ = false;
+ } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) {
+ status_ = Status::IOError("Could not relocate blob during GC");
+ valid_ = false;
+ } else if (blob_decision ==
+ CompactionFilter::BlobDecision::kChangeValue) {
+ value_ = compaction_filter_value_;
+ }
+ }
+
+ // Zeroing out the sequence number leads to better compression.
+ // If this is the bottommost level (no files in lower levels)
+ // and the earliest snapshot is larger than this seqno
+ // and the userkey differs from the last userkey in compaction
+ // then we can squash the seqno to zero.
+ //
+ // This is safe for TransactionDB write-conflict checking since transactions
+ // only care about sequence numbers larger than any active snapshot.
+ //
+ // Can we do the same for levels above the bottom level as long as
+ // KeyNotExistsBeyondOutputLevel() returns true?
+ if (valid_ && compaction_ != nullptr &&
+ !compaction_->allow_ingest_behind() &&
+ ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ &&
+ IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) {
+ assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion);
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected key type %d for seq-zero optimization",
+ ikey_.type);
+ }
+ ikey_.sequence = 0;
+ current_key_.UpdateInternalKey(0, ikey_.type);
+ }
+ }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot) {
+ assert(snapshots_->size());
+ if (snapshots_->size() == 0) {
+ ROCKS_LOG_FATAL(info_log_,
+ "No snapshot left in findEarliestVisibleSnapshot");
+ }
+ auto snapshots_iter = std::lower_bound(
+ snapshots_->begin(), snapshots_->end(), in);
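+ // std::lower_bound returns the first snapshot whose sequence number is
+ // >= `in`; snapshots before it have smaller sequence numbers and cannot
+ // see this entry.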
+ if (snapshots_iter == snapshots_->begin()) {
+ *prev_snapshot = 0;
+ } else {
+ *prev_snapshot = *std::prev(snapshots_iter);
+ assert(*prev_snapshot < in);
+ if (*prev_snapshot >= in) {
+ ROCKS_LOG_FATAL(info_log_,
+ "*prev_snapshot >= in in findEarliestVisibleSnapshot");
+ }
+ }
+ if (snapshot_checker_ == nullptr) {
+ return snapshots_iter != snapshots_->end()
+ ? *snapshots_iter : kMaxSequenceNumber;
+ }
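+ // With a SnapshotChecker (e.g. WritePrepared transactions), a snapshot
+ // with a larger sequence number does not necessarily see this entry, so
+ // walk forward until one is confirmed to contain it, skipping snapshots
+ // that have already been released.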
+ bool has_released_snapshot = !released_snapshots_.empty();
+ for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+ auto cur = *snapshots_iter;
+ assert(in <= cur);
+ if (in > cur) {
+ ROCKS_LOG_FATAL(info_log_, "in > cur in findEarliestVisibleSnapshot");
+ }
+ // Skip if cur is in released_snapshots.
+ if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+ continue;
+ }
+ auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+ if (res == SnapshotCheckerResult::kInSnapshot) {
+ return cur;
+ } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+ released_snapshots_.insert(cur);
+ }
+ *prev_snapshot = cur;
+ }
+ return kMaxSequenceNumber;
+}
+
+// Used in two places: prevents deletion markers from being dropped if they
+// may still be needed, and disables seqnum zero-out in PrepareOutput for
+// recent keys.
+inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() {
+ return (!compaction_->preserve_deletes()) ||
+ (ikey_.sequence < preserve_deletes_seqnum_);
+}
+
+bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) {
+ assert(snapshot_checker_ != nullptr);
+ bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber ||
+ (earliest_snapshot_iter_ != snapshots_->end() &&
+ *earliest_snapshot_iter_ == earliest_snapshot_));
+ assert(pre_condition);
+ if (!pre_condition) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Pre-Condition is not hold in IsInEarliestSnapshot");
+ }
+ auto in_snapshot =
+ snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
+ while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) {
+ // Avoid the current earliest_snapshot_ being returned as the
+ // earliest visible snapshot for the next value, so that if a value's
+ // sequence is zeroed out by PrepareOutput(), the next value will be
+ // compacted out.
+ released_snapshots_.insert(earliest_snapshot_);
+ earliest_snapshot_iter_++;
+
+ if (earliest_snapshot_iter_ == snapshots_->end()) {
+ earliest_snapshot_ = kMaxSequenceNumber;
+ } else {
+ earliest_snapshot_ = *earliest_snapshot_iter_;
+ }
+ in_snapshot =
+ snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
+ }
+ assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased);
+ if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected released snapshot in IsInEarliestSnapshot");
+ }
+ return in_snapshot == SnapshotCheckerResult::kInSnapshot;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h
new file mode 100644
index 000000000..8be60eb9e
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,240 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionIterator {
+ public:
+ // A wrapper around Compaction. Has a much smaller interface, only what
+ // CompactionIterator uses. Tests can override it.
+ class CompactionProxy {
+ public:
+ explicit CompactionProxy(const Compaction* compaction)
+ : compaction_(compaction) {}
+
+ virtual ~CompactionProxy() = default;
+ virtual int level(size_t /*compaction_input_level*/ = 0) const {
+ return compaction_->level();
+ }
+ virtual bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+ }
+ virtual bool bottommost_level() const {
+ return compaction_->bottommost_level();
+ }
+ virtual int number_levels() const { return compaction_->number_levels(); }
+ virtual Slice GetLargestUserKey() const {
+ return compaction_->GetLargestUserKey();
+ }
+ virtual bool allow_ingest_behind() const {
+ return compaction_->immutable_cf_options()->allow_ingest_behind;
+ }
+ virtual bool preserve_deletes() const {
+ return compaction_->immutable_cf_options()->preserve_deletes;
+ }
+
+ protected:
+ CompactionProxy() = default;
+
+ private:
+ const Compaction* compaction_;
+ };
+
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ const Compaction* compaction = nullptr,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const SequenceNumber preserve_deletes_seqnum = 0,
+ const std::atomic<bool>* manual_compaction_paused = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr);
+
+ // Constructor with custom CompactionProxy, used for tests.
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const SequenceNumber preserve_deletes_seqnum = 0,
+ const std::atomic<bool>* manual_compaction_paused = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr);
+
+ ~CompactionIterator();
+
+ void ResetRecordCounts();
+
+ // Seek to the beginning of the compaction iterator output.
+ //
+ // REQUIRED: Call only once.
+ void SeekToFirst();
+
+ // Produces the next record in the compaction.
+ //
+ // REQUIRED: SeekToFirst() has been called.
+ void Next();
+
+ // Getters
+ const Slice& key() const { return key_; }
+ const Slice& value() const { return value_; }
+ const Status& status() const { return status_; }
+ const ParsedInternalKey& ikey() const { return ikey_; }
+ bool Valid() const { return valid_; }
+ const Slice& user_key() const { return current_user_key_; }
+ const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+
+ private:
+ // Processes the input stream to find the next output
+ void NextFromInput();
+
+ // Do last preparations before presenting the output to the caller. At this
+ // point this only zeroes out the sequence number if possible, for better
+ // compression.
+ void PrepareOutput();
+
+ // Invoke compaction filter if needed.
+ void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+ // Given a sequence number, return the sequence number of the
+ // earliest snapshot that this sequence number is visible in.
+ // The snapshots themselves are arranged in ascending order of
+ // sequence numbers.
+ // The total number of snapshots is typically small, so the search
+ // (a std::lower_bound plus, with a SnapshotChecker, a short forward
+ // scan) is cheap.
+ inline SequenceNumber findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot);
+
+ // Checks whether the currently seen ikey_ is needed for an
+ // incremental (differential) snapshot and hence can't be dropped
+ // or have its seqnum zeroed out, even if all other conditions are met.
+ inline bool ikeyNotNeededForIncrementalSnapshot();
+
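+ // Returns true if the write at `sequence` is known to be committed: either
+ // there is no snapshot checker, or the checker reports the sequence as in
+ // the snapshot at kMaxSequenceNumber, i.e. visible at the current tip.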
+ inline bool KeyCommitted(SequenceNumber sequence) {
+ return snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(sequence, kMaxSequenceNumber) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ bool IsInEarliestSnapshot(SequenceNumber sequence);
+
+ InternalIterator* input_;
+ const Comparator* cmp_;
+ MergeHelper* merge_helper_;
+ const std::vector<SequenceNumber>* snapshots_;
+ // List of snapshots released during compaction.
+ // findEarliestVisibleSnapshot() discovers them from the return value of
+ // snapshot_checker_, and makes sure they will not be returned as the
+ // earliest visible snapshot of an older value.
+ // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+ std::unordered_set<SequenceNumber> released_snapshots_;
+ std::vector<SequenceNumber>::const_iterator earliest_snapshot_iter_;
+ const SequenceNumber earliest_write_conflict_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ Env* env_;
+ bool report_detailed_time_;
+ bool expect_valid_internal_key_;
+ CompactionRangeDelAggregator* range_del_agg_;
+ std::unique_ptr<CompactionProxy> compaction_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>* manual_compaction_paused_;
+ const SequenceNumber preserve_deletes_seqnum_;
+ bool bottommost_level_;
+ bool valid_ = false;
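+ // True when there are no snapshots, in which case every committed key is
+ // visible at the "tip" (the latest sequence number).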
+ bool visible_at_tip_;
+ SequenceNumber earliest_snapshot_;
+ SequenceNumber latest_snapshot_;
+
+ // State
+ //
+ // Points to a copy of the current compaction iterator output (current_key_)
+ // if valid_.
+ Slice key_;
+ // Points to the value in the underlying iterator that corresponds to the
+ // current output.
+ Slice value_;
+ // The status is OK unless compaction iterator encounters a merge operand
+ // while not having a merge operator defined.
+ Status status_;
+ // Stores the user key, sequence number and type of the current compaction
+ // iterator output (or current key in the underlying iterator during
+ // NextFromInput()).
+ ParsedInternalKey ikey_;
+ // Stores whether ikey_.user_key is valid. If set to false, the user key is
+ // not compared against the current key in the underlying iterator.
+ bool has_current_user_key_ = false;
+ // True if the underlying input_ iterator has already been advanced to the
+ // entry after the current output (e.g. after peeking past a SingleDelete),
+ // so Next() should not advance it again.
+ bool at_next_ = false;
+ // Holds a copy of the current compaction iterator output (or current key in
+ // the underlying iterator during NextFromInput()).
+ IterKey current_key_;
+ Slice current_user_key_;
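+ // Sequence number and earliest visible snapshot of the most recently
+ // examined entry for the current user key; NextFromInput() compares them
+ // against the next entry to detect versions hidden within the same
+ // snapshot stripe.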
+ SequenceNumber current_user_key_sequence_;
+ SequenceNumber current_user_key_snapshot_;
+
+ // True if the iterator has already returned a record for the current key.
+ bool has_outputted_key_ = false;
+
+ // If true, clear the value of the next key and output it without applying
+ // any compaction rules. This is used for outputting a Put after a
+ // SingleDelete.
+ bool clear_and_output_next_key_ = false;
+
+ MergeOutputIterator merge_out_iter_;
+ // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+ // merge operands and then releasing them after consuming them.
+ PinnedIteratorsManager pinned_iters_mgr_;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+ // "level_ptrs" holds indices that remember which file of an associated
+ // level we were last checking during the last call to compaction->
+ // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
+ // to pick off where it left off since each subcompaction's key range is
+ // increasing so a later call to the function must be looking for a key that
+ // is in or beyond the last file checked during the previous call
+ std::vector<size_t> level_ptrs_;
+ CompactionIterationStats iter_stats_;
+
+ // Used to avoid purging uncommitted values. The application can specify
+ // uncommitted values by providing a SnapshotChecker object.
+ bool current_key_committed_;
+ std::shared_ptr<Logger> info_log_;
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+
+ bool IsPausingManualCompaction() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return manual_compaction_paused_ &&
+ manual_compaction_paused_->load(std::memory_order_relaxed);
+ }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc
new file mode 100644
index 000000000..0c50fb9ba
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc
@@ -0,0 +1,976 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iterator.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expects no merging attempts.
+class NoMergingMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* /*merge_out*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& /*operand_list*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ const char* Name() const override {
+ return "CompactionIteratorTest NoMergingMergeOp";
+ }
+};
+
+// Compaction filter that gets stuck when it sees a particular key,
+// then gets unstuck when told to.
+// Always returns Decision::kRemove.
+class StallingFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int k = std::atoi(key.ToString().c_str());
+ last_seen.store(k);
+ while (k >= stall_at.load()) {
+ std::this_thread::yield();
+ }
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest StallingFilter";
+ }
+
+ // Wait until the filter sees a key >= k and stalls at that key.
+ // If `exact`, asserts that the seen key is equal to k.
+ void WaitForStall(int k, bool exact = true) {
+ stall_at.store(k);
+ while (last_seen.load() < k) {
+ std::this_thread::yield();
+ }
+ if (exact) {
+ EXPECT_EQ(k, last_seen.load());
+ }
+ }
+
+ // Filter will stall on key >= stall_at. Advance stall_at to unstall.
+ mutable std::atomic<int> stall_at{0};
+ // Last key the filter was called with.
+ mutable std::atomic<int> last_seen{0};
+};
+
+// Compaction filter that filters out all keys.
+class FilterAllKeysCompactionFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override { return "AllKeysCompactionFilter"; }
+};
+
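+// A forward-only InternalIterator over parallel key/value vectors that logs
+// every SeekToFirst/Seek/Next call, so tests can assert on the exact access
+// pattern of the compaction iterator (see CompactionFilterSkipUntil).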
+class LoggingForwardVectorIterator : public InternalIterator {
+ public:
+ struct Action {
+ enum class Type {
+ SEEK_TO_FIRST,
+ SEEK,
+ NEXT,
+ };
+
+ Type type;
+ std::string arg;
+
+ explicit Action(Type _type, std::string _arg = "")
+ : type(_type), arg(_arg) {}
+
+ bool operator==(const Action& rhs) const {
+ return std::tie(type, arg) == std::tie(rhs.type, rhs.arg);
+ }
+ };
+
+ LoggingForwardVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values)
+ : keys_(keys), values_(values), current_(keys.size()) {
+ assert(keys_.size() == values_.size());
+ }
+
+ bool Valid() const override { return current_ < keys_.size(); }
+
+ void SeekToFirst() override {
+ log.emplace_back(Action::Type::SEEK_TO_FIRST);
+ current_ = 0;
+ }
+ void SeekToLast() override { assert(false); }
+
+ void Seek(const Slice& target) override {
+ log.emplace_back(Action::Type::SEEK, target.ToString());
+ current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) -
+ keys_.begin();
+ }
+
+ void SeekForPrev(const Slice& /*target*/) override { assert(false); }
+
+ void Next() override {
+ assert(Valid());
+ log.emplace_back(Action::Type::NEXT);
+ current_++;
+ }
+ void Prev() override { assert(false); }
+
+ Slice key() const override {
+ assert(Valid());
+ return Slice(keys_[current_]);
+ }
+ Slice value() const override {
+ assert(Valid());
+ return Slice(values_[current_]);
+ }
+
+ Status status() const override { return Status::OK(); }
+
+ std::vector<Action> log;
+
+ private:
+ std::vector<std::string> keys_;
+ std::vector<std::string> values_;
+ size_t current_;
+};
+
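+// A minimal CompactionProxy stub; its level/bottommost behavior is controlled
+// through the public flags below.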
+class FakeCompaction : public CompactionIterator::CompactionProxy {
+ public:
+ FakeCompaction() = default;
+
+ int level(size_t /*compaction_input_level*/) const override { return 0; }
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& /*user_key*/,
+ std::vector<size_t>* /*level_ptrs*/) const override {
+ return is_bottommost_level || key_not_exists_beyond_output_level;
+ }
+ bool bottommost_level() const override { return is_bottommost_level; }
+ int number_levels() const override { return 1; }
+ Slice GetLargestUserKey() const override {
+ return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ }
+ bool allow_ingest_behind() const override { return false; }
+
+ bool preserve_deletes() const override { return false; }
+
+ bool key_not_exists_beyond_output_level = false;
+
+ bool is_bottommost_level = false;
+};
+
+// A simplified snapshot checker which assumes each snapshot has a global
+// last visible sequence.
+class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit TestSnapshotChecker(
+ SequenceNumber last_committed_sequence,
+ const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots = {{}})
+ : last_committed_sequence_(last_committed_sequence),
+ snapshots_(snapshots) {}
+
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ if (snapshot_seq == kMaxSequenceNumber) {
+ return seq <= last_committed_sequence_
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+ assert(snapshots_.count(snapshot_seq) > 0);
+ return seq <= snapshots_.at(snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ private:
+ SequenceNumber last_committed_sequence_;
+ // A map from a valid snapshot to the last sequence number visible to
+ // that snapshot.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshots_;
+};
+
+// Test param:
+// bool: whether to pass snapshot_checker to compaction iterator.
+class CompactionIteratorTest : public testing::TestWithParam<bool> {
+ public:
+ CompactionIteratorTest()
+ : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {}
+
+ void InitIterators(
+ const std::vector<std::string>& ks, const std::vector<std::string>& vs,
+ const std::vector<std::string>& range_del_ks,
+ const std::vector<std::string>& range_del_vs,
+ SequenceNumber last_sequence,
+ SequenceNumber last_committed_sequence = kMaxSequenceNumber,
+ MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) {
+ std::unique_ptr<InternalIterator> unfragmented_range_del_iter(
+ new test::VectorIterator(range_del_ks, range_del_vs));
+ auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(unfragmented_range_del_iter), icmp_);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ new FragmentedRangeTombstoneIterator(tombstone_list, icmp_,
+ kMaxSequenceNumber));
+ range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_));
+ range_del_agg_->AddTombstones(std::move(range_del_iter));
+
+ std::unique_ptr<CompactionIterator::CompactionProxy> compaction;
+ if (filter || bottommost_level) {
+ compaction_proxy_ = new FakeCompaction();
+ compaction_proxy_->is_bottommost_level = bottommost_level;
+ compaction.reset(compaction_proxy_);
+ }
+ bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
+ if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) {
+ snapshot_checker_.reset(
+ new TestSnapshotChecker(last_committed_sequence, snapshot_map_));
+ }
+ merge_helper_.reset(
+ new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false,
+ 0 /*latest_snapshot*/, snapshot_checker_.get(),
+ 0 /*level*/, nullptr /*statistics*/, &shutting_down_));
+
+ iter_.reset(new LoggingForwardVectorIterator(ks, vs));
+ iter_->SeekToFirst();
+ c_iter_.reset(new CompactionIterator(
+ iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_,
+ earliest_write_conflict_snapshot, snapshot_checker_.get(),
+ Env::Default(), false /* report_detailed_time */, false,
+ range_del_agg_.get(), std::move(compaction), filter, &shutting_down_));
+ }
+
+ void AddSnapshot(SequenceNumber snapshot,
+ SequenceNumber last_visible_seq = kMaxSequenceNumber) {
+ snapshots_.push_back(snapshot);
+ snapshot_map_[snapshot] = last_visible_seq;
+ }
+
+ virtual bool UseSnapshotChecker() const { return false; }
+
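+ // Feeds input_keys/input_values through a fresh CompactionIterator and
+ // verifies that it emits exactly expected_keys/expected_values, in order.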
+ void RunTest(
+ const std::vector<std::string>& input_keys,
+ const std::vector<std::string>& input_values,
+ const std::vector<std::string>& expected_keys,
+ const std::vector<std::string>& expected_values,
+ SequenceNumber last_committed_seq = kMaxSequenceNumber,
+ MergeOperator* merge_operator = nullptr,
+ CompactionFilter* compaction_filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) {
+ InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber,
+ last_committed_seq, merge_operator, compaction_filter,
+ bottommost_level, earliest_write_conflict_snapshot);
+ c_iter_->SeekToFirst();
+ for (size_t i = 0; i < expected_keys.size(); i++) {
+ std::string info = "i = " + ToString(i);
+ ASSERT_TRUE(c_iter_->Valid()) << info;
+ ASSERT_OK(c_iter_->status()) << info;
+ ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info;
+ ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info;
+ c_iter_->Next();
+ }
+ ASSERT_FALSE(c_iter_->Valid());
+ }
+
+ const Comparator* cmp_;
+ const InternalKeyComparator icmp_;
+ std::vector<SequenceNumber> snapshots_;
+ // A map from a valid snapshot to the last sequence number visible to
+ // that snapshot.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::unique_ptr<LoggingForwardVectorIterator> iter_;
+ std::unique_ptr<CompactionIterator> c_iter_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+ std::atomic<bool> shutting_down_{false};
+ FakeCompaction* compaction_proxy_ = nullptr;
+};
+
+// It is possible that the output of the compaction iterator is empty even if
+// the input is not.
+TEST_P(CompactionIteratorTest, EmptyResult) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue)},
+ {"", "val"}, {}, {}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+// If there is a corruption after a single deletion, the corrupted key should
+// be preserved.
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue, true),
+ test::KeyStr("b", 10, kTypeValue)},
+ {"", "val", "val2"}, {}, {}, 10);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
+ c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, SimpleRangeDeletion) {
+ InitIterators({test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("morning", 2, kTypeValue),
+ test::KeyStr("night", 3, kTypeValue)},
+ {"zao", "zao", "wan"},
+ {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) {
+ AddSnapshot(10);
+ std::vector<std::string> ks1;
+ ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion));
+ std::vector<std::string> vs1{"mz"};
+ std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue),
+ test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("night", 40, kTypeValue),
+ test::KeyStr("night", 20, kTypeValue)};
+ std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"};
+ InitIterators(ks2, vs2, ks1, vs1, 40);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+ // See InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we closely assert that compaction filter is
+ // called with the expected keys and only them, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("av50", v);
+ return Decision::kKeep;
+ }
+ if (k == "b") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("bv60", v);
+ *skip_until = "d+";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "e") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("em71", v);
+ return Decision::kKeep;
+ }
+ if (k == "f") {
+ if (v == "fm65") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "f";
+ } else {
+ EXPECT_EQ("fm30", v);
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "g+";
+ }
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "h") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("hv91", v);
+ return Decision::kKeep;
+ }
+ if (k == "i") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("im95", v);
+ *skip_until = "z";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter";
+ }
+ };
+
+ NoMergingMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ {test::KeyStr("a", 50, kTypeValue), // keep
+ test::KeyStr("a", 45, kTypeMerge),
+ test::KeyStr("b", 60, kTypeValue), // skip to "d+"
+ test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue),
+ test::KeyStr("d", 70, kTypeMerge),
+ test::KeyStr("e", 71, kTypeMerge), // keep
+ test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep
+ test::KeyStr("f", 30, kTypeMerge), // skip to "g+"
+ test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue),
+ test::KeyStr("h", 91, kTypeValue), // keep
+ test::KeyStr("i", 95, kTypeMerge), // skip to "z"
+ test::KeyStr("j", 99, kTypeValue)},
+ {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30",
+ "fv25", "gv90", "hv91", "im95", "jv99"},
+ {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter);
+
+ // Compaction should output just "a", "e" and "h" keys.
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("av50", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("em71", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("hv91", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+
+ // Check that the compaction iterator did the correct sequence of calls on
+ // the underlying iterator.
+ using A = LoggingForwardVectorIterator::Action;
+ using T = A::Type;
+ std::vector<A> expected_actions = {
+ A(T::SEEK_TO_FIRST),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))};
+ ASSERT_EQ(expected_actions, iter_->log);
+}
+
+TEST_P(CompactionIteratorTest, ShuttingDownInFilter) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue),
+ test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ // Don't leave tombstones (kTypeDeletion) for filtered keys.
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ EXPECT_FALSE(c_iter_->Valid());
+ EXPECT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+// Same as ShuttingDownInFilter, but shutdown happens during filter call for
+// a merge operand, not for a value.
+TEST_P(CompactionIteratorTest, ShuttingDownInMerge) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge),
+ test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+ ASSERT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+TEST_P(CompactionIteratorTest, SingleMergeOperand) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+
+ // See InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we closely assert that compaction filter is
+ // called with the expected keys and only them, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("av1", v);
+ return Decision::kKeep;
+ } else if (k == "b") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ return Decision::kKeep;
+ } else if (k == "c") {
+ return Decision::kKeep;
+ }
+
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.SingleMergeOperand::Filter";
+ }
+ };
+
+ class SingleMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ // See InitIterators() call below for why "c" is the only key for which
+ // FullMergeV2 should be called.
+ EXPECT_EQ("c", merge_in.key.ToString());
+
+ std::string temp_value;
+ if (merge_in.existing_value != nullptr) {
+ temp_value = merge_in.existing_value->ToString();
+ }
+
+ for (auto& operand : merge_in.operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ merge_out->new_value = temp_value;
+
+ return true;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ std::string string_key = key.ToString();
+ EXPECT_TRUE(string_key == "a" || string_key == "b");
+
+ if (string_key == "a") {
+ EXPECT_EQ(1, operand_list.size());
+ } else if (string_key == "b") {
+ EXPECT_EQ(2, operand_list.size());
+ }
+
+ std::string temp_value;
+ for (auto& operand : operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ swap(temp_value, *new_value);
+
+ return true;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest SingleMergeOp";
+ }
+
+ bool AllowSingleOperand() const override { return true; }
+ };
+
+ SingleMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ // a should invoke PartialMergeMulti with a single merge operand.
+ {test::KeyStr("a", 50, kTypeMerge),
+ // b should invoke PartialMergeMulti with two operands.
+ test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge),
+ // c should invoke FullMerge due to kTypeValue at the beginning.
+ test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)},
+ {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, &merge_op, &filter);
+
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("av1", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ("bv1bv2", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_EQ("cv1cv2", c_iter_->value().ToString());
+}
+
+// In bottommost level, values earlier than earliest snapshot can be output
+// with sequence = 0.
+TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In bottommost level, deletions earlier than earliest snapshot can be removed
+// permanently.
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 1, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 0, kTypeValue)},
+ {"", ""},
+ kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+// In bottommost level, single deletions earlier than earliest snapshot can be
+// removed permanently.
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion)},
+ {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
+ kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
+ testing::Values(true, false));
+
+// Tests how CompactionIterator work together with SnapshotChecker.
+class CompactionIteratorWithSnapshotCheckerTest
+ : public CompactionIteratorTest {
+ public:
+ bool UseSnapshotChecker() const override { return true; }
+};
+
+// Uncommitted keys (keys with seq > last_committed_seq) should be output as-is
+// while committed versions of these keys should get compacted as usual.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Value) {
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Deletion) {
+ RunTest({test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Merge) {
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_SingleDelete) {
+ RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_BlobIndex) {
+ RunTest({test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+// Test that the compaction iterator dedups keys visible to the same snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) {
+ AddSnapshot(2, 1);
+ AddSnapshot(4, 3);
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ DedupSameSnapshot_SingleDeletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeSingleDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+// At the bottom level, sequence numbers can be zeroed out, and deletions can
+// be removed, but only when they are visible to the earliest snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
+ test::KeyStr("c", 3, kTypeDeletion)},
+ {"", "", ""},
+ {},
+ {"", ""}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", "", ""},
+ {test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", ""}, kMaxSequenceNumber /*last_commited_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// A single delete should not cancel out values that are not visible to the
+// same set of snapshots.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ SingleDeleteAcrossSnapshotBoundary) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"}, 2 /*last_committed_seq*/);
+}
+
+// A single delete should be kept in case it is not visible to the
+// earliest write conflict snapshot. If a single delete is kept for this
+// reason, the corresponding value can be trimmed to save space.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// The compaction filter should keep uncommitted keys as-is, and
+// * convert the latest value to a deletion, and/or
+// * if the latest value is a merge, apply the filter to all subsequent merges.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)},
+ {"v2", "v1", "v3", "v4"},
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)},
+ {"v2", "", "v3", ""}, 1 /*last_committed_seq*/,
+ nullptr /*merge_operator*/, compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeDeletion)},
+ {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ CompactionFilter_PartialMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeMerge)},
+ {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
+ 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
+ {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
+ compaction_filter.get());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc
new file mode 100644
index 000000000..576ec7b45
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.cc
@@ -0,0 +1,1700 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <functional>
+#include <list>
+#include <memory>
+#include <random>
+#include <set>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+ switch (compaction_reason) {
+ case CompactionReason::kUnknown:
+ return "Unknown";
+ case CompactionReason::kLevelL0FilesNum:
+ return "LevelL0FilesNum";
+ case CompactionReason::kLevelMaxLevelSize:
+ return "LevelMaxLevelSize";
+ case CompactionReason::kUniversalSizeAmplification:
+ return "UniversalSizeAmplification";
+ case CompactionReason::kUniversalSizeRatio:
+ return "UniversalSizeRatio";
+ case CompactionReason::kUniversalSortedRunNum:
+ return "UniversalSortedRunNum";
+ case CompactionReason::kFIFOMaxSize:
+ return "FIFOMaxSize";
+ case CompactionReason::kFIFOReduceNumFiles:
+ return "FIFOReduceNumFiles";
+ case CompactionReason::kFIFOTtl:
+ return "FIFOTtl";
+ case CompactionReason::kManualCompaction:
+ return "ManualCompaction";
+ case CompactionReason::kFilesMarkedForCompaction:
+ return "FilesMarkedForCompaction";
+ case CompactionReason::kBottommostFiles:
+ return "BottommostFiles";
+ case CompactionReason::kTtl:
+ return "Ttl";
+ case CompactionReason::kFlush:
+ return "Flush";
+ case CompactionReason::kExternalSstIngestion:
+ return "ExternalSstIngestion";
+ case CompactionReason::kPeriodicCompaction:
+ return "PeriodicCompaction";
+ case CompactionReason::kNumOfReasons:
+ // fall through
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+// Maintains state for each sub-compaction
+struct CompactionJob::SubcompactionState {
+ const Compaction* compaction;
+ std::unique_ptr<CompactionIterator> c_iter;
+
+ // The boundaries of the key-range this compaction is interested in. No two
+ // subcompactions may have overlapping key-ranges.
+ // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+ Slice *start, *end;
+
+ // The return status of this subcompaction
+ Status status;
+
+ // Files produced by this subcompaction
+ struct Output {
+ FileMetaData meta;
+ bool finished;
+ std::shared_ptr<const TableProperties> table_properties;
+ };
+
+ // State kept for output being generated
+ std::vector<Output> outputs;
+ std::unique_ptr<WritableFileWriter> outfile;
+ std::unique_ptr<TableBuilder> builder;
+ Output* current_output() {
+ if (outputs.empty()) {
+      // This subcompaction's output could be empty if the compaction was
+      // aborted before this subcompaction had a chance to generate any output
+      // files. When subcompactions are executed sequentially this is more
+      // likely, and the later subcompactions are particularly likely to be
+      // empty. Once they are run in parallel, however, it should be much
+      // rarer.
+ return nullptr;
+ } else {
+ return &outputs.back();
+ }
+ }
+
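+  // Size (in bytes) of the output file currently being built, as reported by
+  // the table builder.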
+ uint64_t current_output_file_size;
+
+ // State during the subcompaction
+ uint64_t total_bytes;
+ uint64_t num_output_records;
+ CompactionJobStats compaction_job_stats;
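+  // Approximate size of the data covered by this subcompaction's key range
+  // (0 when no estimate is available).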
+ uint64_t approx_size;
+ // An index that used to speed up ShouldStopBefore().
+ size_t grandparent_index = 0;
+ // The number of bytes overlapping between the current output and
+ // grandparent files used in ShouldStopBefore().
+ uint64_t overlapped_bytes = 0;
+  // A flag that records whether a key has been seen in ShouldStopBefore()
+ bool seen_key = false;
+
+ SubcompactionState(Compaction* c, Slice* _start, Slice* _end,
+ uint64_t size = 0)
+ : compaction(c),
+ start(_start),
+ end(_end),
+ outfile(nullptr),
+ builder(nullptr),
+ current_output_file_size(0),
+ total_bytes(0),
+ num_output_records(0),
+ approx_size(size),
+ grandparent_index(0),
+ overlapped_bytes(0),
+ seen_key(false) {
+ assert(compaction != nullptr);
+ }
+
+ SubcompactionState(SubcompactionState&& o) { *this = std::move(o); }
+
+ SubcompactionState& operator=(SubcompactionState&& o) {
+ compaction = std::move(o.compaction);
+ start = std::move(o.start);
+ end = std::move(o.end);
+ status = std::move(o.status);
+ outputs = std::move(o.outputs);
+ outfile = std::move(o.outfile);
+ builder = std::move(o.builder);
+ current_output_file_size = std::move(o.current_output_file_size);
+ total_bytes = std::move(o.total_bytes);
+ num_output_records = std::move(o.num_output_records);
+ compaction_job_stats = std::move(o.compaction_job_stats);
+ approx_size = std::move(o.approx_size);
+ grandparent_index = std::move(o.grandparent_index);
+ overlapped_bytes = std::move(o.overlapped_bytes);
+ seen_key = std::move(o.seen_key);
+ return *this;
+ }
+
+ // Because member std::unique_ptrs do not have these.
+ SubcompactionState(const SubcompactionState&) = delete;
+
+ SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+ // Returns true iff we should stop building the current output
+ // before processing "internal_key".
+ bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) {
+ const InternalKeyComparator* icmp =
+ &compaction->column_family_data()->internal_comparator();
+ const std::vector<FileMetaData*>& grandparents = compaction->grandparents();
+
+ // Scan to find earliest grandparent file that contains key.
+ while (grandparent_index < grandparents.size() &&
+ icmp->Compare(internal_key,
+ grandparents[grandparent_index]->largest.Encode()) >
+ 0) {
+ if (seen_key) {
+ overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize();
+ }
+ assert(grandparent_index + 1 >= grandparents.size() ||
+ icmp->Compare(
+ grandparents[grandparent_index]->largest.Encode(),
+ grandparents[grandparent_index + 1]->smallest.Encode()) <= 0);
+ grandparent_index++;
+ }
+ seen_key = true;
+
+ if (overlapped_bytes + curr_file_size >
+ compaction->max_compaction_bytes()) {
+ // Too much overlap for current output; start new output
+ overlapped_bytes = 0;
+ return true;
+ }
+
+ return false;
+ }
+};
+
+// Maintains state for the entire compaction
+struct CompactionJob::CompactionState {
+ Compaction* const compaction;
+
+ // REQUIRED: subcompaction states are stored in order of increasing
+ // key-range
+ std::vector<CompactionJob::SubcompactionState> sub_compact_states;
+ Status status;
+
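+  // Aggregated over all subcompactions by AggregateStatistics().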
+ uint64_t total_bytes;
+ uint64_t num_output_records;
+
+ explicit CompactionState(Compaction* c)
+ : compaction(c),
+ total_bytes(0),
+ num_output_records(0) {}
+
+ size_t NumOutputFiles() {
+ size_t total = 0;
+ for (auto& s : sub_compact_states) {
+ total += s.outputs.size();
+ }
+ return total;
+ }
+
+ Slice SmallestUserKey() {
+ for (const auto& sub_compact_state : sub_compact_states) {
+ if (!sub_compact_state.outputs.empty() &&
+ sub_compact_state.outputs[0].finished) {
+ return sub_compact_state.outputs[0].meta.smallest.user_key();
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice(nullptr, 0);
+ }
+
+ Slice LargestUserKey() {
+ for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+ ++it) {
+ if (!it->outputs.empty() && it->current_output()->finished) {
+ assert(it->current_output() != nullptr);
+ return it->current_output()->meta.largest.user_key();
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice(nullptr, 0);
+ }
+};
+
+void CompactionJob::AggregateStatistics() {
+ for (SubcompactionState& sc : compact_->sub_compact_states) {
+ compact_->total_bytes += sc.total_bytes;
+ compact_->num_output_records += sc.num_output_records;
+ }
+ if (compaction_job_stats_) {
+ for (SubcompactionState& sc : compact_->sub_compact_states) {
+ compaction_job_stats_->Add(sc.compaction_job_stats);
+ }
+ }
+}
+
+CompactionJob::CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer,
+ Directory* db_directory, Directory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, std::shared_ptr<Cache> table_cache,
+ EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname, CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri, const std::atomic<bool>* manual_compaction_paused)
+ : job_id_(job_id),
+ compact_(new CompactionState(compaction)),
+ compaction_job_stats_(compaction_job_stats),
+ compaction_stats_(compaction->compaction_reason(), 1),
+ dbname_(dbname),
+ db_options_(db_options),
+ file_options_(file_options),
+ env_(db_options.env),
+ fs_(db_options.fs.get()),
+ file_options_for_read_(
+ fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+ versions_(versions),
+ shutting_down_(shutting_down),
+ manual_compaction_paused_(manual_compaction_paused),
+ preserve_deletes_seqnum_(preserve_deletes_seqnum),
+ log_buffer_(log_buffer),
+ db_directory_(db_directory),
+ output_directory_(output_directory),
+ stats_(stats),
+ db_mutex_(db_mutex),
+ db_error_handler_(db_error_handler),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ table_cache_(std::move(table_cache)),
+ event_logger_(event_logger),
+ bottommost_level_(false),
+ paranoid_file_checks_(paranoid_file_checks),
+ measure_io_stats_(measure_io_stats),
+ write_hint_(Env::WLTH_NOT_SET),
+ thread_pri_(thread_pri) {
+ assert(log_buffer_ != nullptr);
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+ ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+ assert(compact_ == nullptr);
+ ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+
+ ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+ job_id_);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+ (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+ compact_->compaction->output_level());
+
+ // In the current design, a CompactionJob is always created
+ // for non-trivial compaction.
+ assert(compaction->IsTrivialMove() == false ||
+ compaction->is_manual_compaction() == true);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_PROP_FLAGS,
+ compaction->is_manual_compaction() +
+ (compaction->deletion_compaction() << 1));
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+ compaction->CalculateTotalInputSize());
+
+ IOSTATS_RESET(bytes_written);
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+ // Set the thread operation after operation properties
+ // to ensure GetThreadList() can always show them all together.
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ if (compaction_job_stats_) {
+ compaction_job_stats_->is_manual_compaction =
+ compaction->is_manual_compaction();
+ }
+}
+
+void CompactionJob::Prepare() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+  // Generate file_levels_ for the compaction before making the iterator
+ auto* c = compact_->compaction;
+ assert(c->column_family_data() != nullptr);
+ assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ =
+ c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ if (c->ShouldFormSubcompactions()) {
+ {
+ StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME);
+ GenSubcompactionBoundaries();
+ }
+ assert(sizes_.size() == boundaries_.size() + 1);
+
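+    // boundaries_ defines boundaries_.size() + 1 contiguous key ranges, one
+    // per subcompaction.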
+ for (size_t i = 0; i <= boundaries_.size(); i++) {
+ Slice* start = i == 0 ? nullptr : &boundaries_[i - 1];
+ Slice* end = i == boundaries_.size() ? nullptr : &boundaries_[i];
+ compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]);
+ }
+ RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+ compact_->sub_compact_states.size());
+ } else {
+ compact_->sub_compact_states.emplace_back(c, nullptr, nullptr);
+ }
+}
+
+struct RangeWithSize {
+ Range range;
+ uint64_t size;
+
+ RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+ : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+ auto* c = compact_->compaction;
+ auto* cfd = c->column_family_data();
+ const Comparator* cfd_comparator = cfd->user_comparator();
+ std::vector<Slice> bounds;
+ int start_lvl = c->start_level();
+ int out_lvl = c->output_level();
+
+ // Add the starting and/or ending key of certain input files as a potential
+ // boundary
+ for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+ int lvl = c->level(lvl_idx);
+ if (lvl >= start_lvl && lvl <= out_lvl) {
+ const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+ size_t num_files = flevel->num_files;
+
+ if (num_files == 0) {
+ continue;
+ }
+
+ if (lvl == 0) {
+ // For level 0 add the starting and ending key of each file since the
+ // files may have greatly differing key ranges (not range-partitioned)
+ for (size_t i = 0; i < num_files; i++) {
+ bounds.emplace_back(flevel->files[i].smallest_key);
+ bounds.emplace_back(flevel->files[i].largest_key);
+ }
+ } else {
+ // For all other levels add the smallest/largest key in the level to
+ // encompass the range covered by that level
+ bounds.emplace_back(flevel->files[0].smallest_key);
+ bounds.emplace_back(flevel->files[num_files - 1].largest_key);
+ if (lvl == out_lvl) {
+ // For the last level include the starting keys of all files since
+ // the last level is the largest and probably has the widest key
+ // range. Since it's range partitioned, the ending key of one file
+ // and the starting key of the next are very close (or identical).
+ for (size_t i = 1; i < num_files; i++) {
+ bounds.emplace_back(flevel->files[i].smallest_key);
+ }
+ }
+ }
+ }
+ }
+
+ std::sort(bounds.begin(), bounds.end(),
+ [cfd_comparator](const Slice& a, const Slice& b) -> bool {
+ return cfd_comparator->Compare(ExtractUserKey(a),
+ ExtractUserKey(b)) < 0;
+ });
+ // Remove duplicated entries from bounds
+ bounds.erase(
+ std::unique(bounds.begin(), bounds.end(),
+ [cfd_comparator](const Slice& a, const Slice& b) -> bool {
+ return cfd_comparator->Compare(ExtractUserKey(a),
+ ExtractUserKey(b)) == 0;
+ }),
+ bounds.end());
+
+ // Combine consecutive pairs of boundaries into ranges with an approximate
+ // size of data covered by keys in that range
+ uint64_t sum = 0;
+ std::vector<RangeWithSize> ranges;
+ // Get input version from CompactionState since it's already referenced
+  // earlier in Compaction::SetInputVersion and will not change
+ // when db_mutex_ is released below
+ auto* v = compact_->compaction->input_version();
+ for (auto it = bounds.begin();;) {
+ const Slice a = *it;
+ ++it;
+
+ if (it == bounds.end()) {
+ break;
+ }
+
+ const Slice b = *it;
+
+ // ApproximateSize could potentially create table reader iterator to seek
+ // to the index block and may incur I/O cost in the process. Unlock db
+ // mutex to reduce contention
+ db_mutex_->Unlock();
+ uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
+ b, start_lvl, out_lvl + 1,
+ TableReaderCaller::kCompaction);
+ db_mutex_->Lock();
+ ranges.emplace_back(a, b, size);
+ sum += size;
+ }
+
+ // Group the ranges into subcompactions
+ const double min_file_fill_percent = 4.0 / 5;
+ int base_level = v->storage_info()->base_level();
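+  // Cap the number of subcompactions so that, on average, each one covers at
+  // least min_file_fill_percent of a full output file worth of data.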
+ uint64_t max_output_files = static_cast<uint64_t>(std::ceil(
+ sum / min_file_fill_percent /
+ MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl,
+ c->immutable_cf_options()->compaction_style, base_level,
+ c->immutable_cf_options()->level_compaction_dynamic_level_bytes)));
+ uint64_t subcompactions =
+ std::min({static_cast<uint64_t>(ranges.size()),
+ static_cast<uint64_t>(c->max_subcompactions()),
+ max_output_files});
+
+ if (subcompactions > 1) {
+ double mean = sum * 1.0 / subcompactions;
+ // Greedily add ranges to the subcompaction until the sum of the ranges'
+ // sizes becomes >= the expected mean size of a subcompaction
+ sum = 0;
+ for (size_t i = 0; i < ranges.size() - 1; i++) {
+ sum += ranges[i].size;
+ if (subcompactions == 1) {
+ // If there's only one left to schedule then it goes to the end so no
+ // need to put an end boundary
+ continue;
+ }
+ if (sum >= mean) {
+ boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit));
+ sizes_.emplace_back(sum);
+ subcompactions--;
+ sum = 0;
+ }
+ }
+ sizes_.emplace_back(sum + ranges.back().size);
+ } else {
+ // Only one range so its size is the total sum of sizes computed above
+ sizes_.emplace_back(sum);
+ }
+}
+
+Status CompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+ TEST_SYNC_POINT("CompactionJob::Run():Start");
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+
+ const size_t num_threads = compact_->sub_compact_states.size();
+ assert(num_threads > 0);
+ const uint64_t start_micros = env_->NowMicros();
+
+ // Launch a thread for each of subcompactions 1...num_threads-1
+ std::vector<port::Thread> thread_pool;
+ thread_pool.reserve(num_threads - 1);
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+ &compact_->sub_compact_states[i]);
+ }
+
+ // Always schedule the first subcompaction (whether or not there are also
+ // others) in the current thread to be efficient with resources
+ ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+ // Wait for all other threads (if there are any) to finish execution
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ compaction_stats_.micros = env_->NowMicros() - start_micros;
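+  // Wall-clock time is measured once for the whole job; CPU time is summed
+  // over the per-subcompaction stats below.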
+ compaction_stats_.cpu_micros = 0;
+ for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) {
+ compaction_stats_.cpu_micros +=
+ compact_->sub_compact_states[i].compaction_job_stats.cpu_micros;
+ }
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.cpu_micros);
+
+ TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+ // Check if any thread encountered an error during execution
+ Status status;
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+
+ if (status.ok() && output_directory_) {
+ status = output_directory_->Fsync();
+ }
+
+ if (status.ok()) {
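+    // Verify that each newly generated output table is readable, spreading
+    // the verification work across multiple threads.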
+ thread_pool.clear();
+ std::vector<const FileMetaData*> files_meta;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.outputs) {
+ files_meta.emplace_back(&output.meta);
+ }
+ }
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ auto prefix_extractor =
+ compact_->compaction->mutable_cf_options()->prefix_extractor.get();
+ std::atomic<size_t> next_file_meta_idx(0);
+ auto verify_table = [&](Status& output_status) {
+ while (true) {
+ size_t file_idx = next_file_meta_idx.fetch_add(1);
+ if (file_idx >= files_meta.size()) {
+ break;
+ }
+ // Verify that the table is usable
+        // We set for_compaction to false and don't call
+        // OptimizeForCompactionTableRead here because this is a special case
+        // after we finish building the table. No matter whether
+        // use_direct_io_for_flush_and_compaction is true, we regard this
+        // verification as user reads since the goal is to cache the table
+        // here for further user reads.
+ InternalIterator* iter = cfd->table_cache()->NewIterator(
+ ReadOptions(), file_options_, cfd->internal_comparator(),
+ *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor,
+ /*table_reader_ptr=*/nullptr,
+ cfd->internal_stats()->GetFileReadHist(
+ compact_->compaction->output_level()),
+ TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+ /*skip_filters=*/false, compact_->compaction->output_level(),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ auto s = iter->status();
+
+ if (s.ok() && paranoid_file_checks_) {
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {}
+ s = iter->status();
+ }
+
+ delete iter;
+
+ if (!s.ok()) {
+ output_status = s;
+ break;
+ }
+ }
+ };
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(verify_table,
+ std::ref(compact_->sub_compact_states[i].status));
+ }
+ verify_table(compact_->sub_compact_states[0].status);
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+ }
+
+ TablePropertiesCollection tp;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.outputs) {
+ auto fn =
+ TableFileName(state.compaction->immutable_cf_options()->cf_paths,
+ output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
+ tp[fn] = output.table_properties;
+ }
+ }
+ compact_->compaction->SetOutputTableProperties(std::move(tp));
+
+ // Finish up all book-keeping to unify the subcompaction results
+ AggregateStatistics();
+ UpdateCompactionStats();
+ RecordCompactionIOStats();
+ LogFlush(db_options_.info_log);
+ TEST_SYNC_POINT("CompactionJob::Run():End");
+
+ compact_->status = status;
+ return status;
+}
+
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_INSTALL);
+ db_mutex_->AssertHeld();
+ Status status = compact_->status;
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ cfd->internal_stats()->AddCompactionStats(
+ compact_->compaction->output_level(), thread_pri_, compaction_stats_);
+
+ if (status.ok()) {
+ status = InstallCompactionResults(mutable_cf_options);
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ auto vstorage = cfd->current()->storage_info();
+ const auto& stats = compaction_stats_;
+
+ double read_write_amp = 0.0;
+ double write_amp = 0.0;
+ double bytes_read_per_sec = 0;
+ double bytes_written_per_sec = 0;
+
+ if (stats.bytes_read_non_output_levels > 0) {
+ read_write_amp = (stats.bytes_written + stats.bytes_read_output_level +
+ stats.bytes_read_non_output_levels) /
+ static_cast<double>(stats.bytes_read_non_output_levels);
+ write_amp = stats.bytes_written /
+ static_cast<double>(stats.bytes_read_non_output_levels);
+ }
+ if (stats.micros > 0) {
+ bytes_read_per_sec =
+ (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) /
+ static_cast<double>(stats.micros);
+ bytes_written_per_sec =
+ stats.bytes_written / static_cast<double>(stats.micros);
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+ "files in(%d, %d) out(%d) "
+ "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
+ "write-amplify(%.1f) %s, records in: %" PRIu64
+ ", records dropped: %" PRIu64 " output_compression: %s\n",
+ cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec,
+ bytes_written_per_sec, compact_->compaction->output_level(),
+ stats.num_input_files_in_non_output_levels,
+ stats.num_input_files_in_output_level, stats.num_output_files,
+ stats.bytes_read_non_output_levels / 1048576.0,
+ stats.bytes_read_output_level / 1048576.0,
+ stats.bytes_written / 1048576.0, read_write_amp, write_amp,
+ status.ToString().c_str(), stats.num_input_records,
+ stats.num_dropped_records,
+ CompressionTypeToString(compact_->compaction->output_compression())
+ .c_str());
+
+ UpdateCompactionJobStats(stats);
+
+ auto stream = event_logger_->LogToBuffer(log_buffer_);
+ stream << "job" << job_id_ << "event"
+ << "compaction_finished"
+ << "compaction_time_micros" << stats.micros
+ << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level"
+ << compact_->compaction->output_level() << "num_output_files"
+ << compact_->NumOutputFiles() << "total_output_size"
+ << compact_->total_bytes << "num_input_records"
+ << stats.num_input_records << "num_output_records"
+ << compact_->num_output_records << "num_subcompactions"
+ << compact_->sub_compact_states.size() << "output_compression"
+ << CompressionTypeToString(compact_->compaction->output_compression());
+
+ if (compaction_job_stats_ != nullptr) {
+ stream << "num_single_delete_mismatches"
+ << compaction_job_stats_->num_single_del_mismatch;
+ stream << "num_single_delete_fallthrough"
+ << compaction_job_stats_->num_single_del_fallthru;
+ }
+
+ if (measure_io_stats_ && compaction_job_stats_ != nullptr) {
+ stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+ stream << "file_range_sync_nanos"
+ << compaction_job_stats_->file_range_sync_nanos;
+ stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+ stream << "file_prepare_write_nanos"
+ << compaction_job_stats_->file_prepare_write_nanos;
+ }
+
+ stream << "lsm_state";
+ stream.StartArray();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ CleanupCompaction();
+ return status;
+}
+
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+ assert(sub_compact != nullptr);
+
+ uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000;
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+  // Create the compaction filter and fail the compaction if
+  // IgnoreSnapshots() returns false, because that is not supported anymore.
+ const CompactionFilter* compaction_filter =
+ cfd->ioptions()->compaction_filter;
+ std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+ if (compaction_filter == nullptr) {
+ compaction_filter_from_factory =
+ sub_compact->compaction->CreateCompactionFilter();
+ compaction_filter = compaction_filter_from_factory.get();
+ }
+ if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
+ sub_compact->status = Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ return;
+ }
+
+ CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(),
+ existing_snapshots_);
+
+ // Although the v2 aggregator is what the level iterator(s) know about,
+ // the AddTombstones calls will be propagated down to the v1 aggregator.
+ std::unique_ptr<InternalIterator> input(versions_->MakeInputIterator(
+ sub_compact->compaction, &range_del_agg, file_options_for_read_));
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ const uint64_t kRecordStatsEvery = 1000;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+
+ MergeHelper merge(
+ env_, cfd->user_comparator(), cfd->ioptions()->merge_operator,
+ compaction_filter, db_options_.info_log.get(),
+ false /* internal key corruption is expected */,
+ existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+ snapshot_checker_, compact_->compaction->level(),
+ db_options_.statistics.get());
+
+ TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(manual_compaction_paused_)));
+
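+  // Position the raw input iterator at the start of this subcompaction's key
+  // range, or at the first key if the range is unbounded on the left.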
+ Slice* start = sub_compact->start;
+ Slice* end = sub_compact->end;
+ if (start != nullptr) {
+ IterKey start_iter;
+ start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
+ input->Seek(start_iter.GetInternalKey());
+ } else {
+ input->SeekToFirst();
+ }
+
+ Status status;
+ sub_compact->c_iter.reset(new CompactionIterator(
+ input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(),
+ &existing_snapshots_, earliest_write_conflict_snapshot_,
+ snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false,
+ &range_del_agg, sub_compact->compaction, compaction_filter,
+ shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_,
+ db_options_.info_log));
+ auto c_iter = sub_compact->c_iter.get();
+ c_iter->SeekToFirst();
+ if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) {
+ // ShouldStopBefore() maintains state based on keys processed so far. The
+ // compaction loop always calls it on the "next" key, thus won't tell it the
+ // first key. So we do that here.
+ sub_compact->ShouldStopBefore(c_iter->key(),
+ sub_compact->current_output_file_size);
+ }
+ const auto& c_iter_stats = c_iter->iter_stats();
+
+ while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+ // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+ // returns true.
+ const Slice& key = c_iter->key();
+ const Slice& value = c_iter->value();
+
+    // If an end key (exclusive) is specified, check whether the current key
+    // is >= it and exit if so, because the iterator is then out of its range.
+ if (end != nullptr &&
+ cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) {
+ break;
+ }
+ if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+ kRecordStatsEvery - 1) {
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ c_iter->ResetRecordCounts();
+ RecordCompactionIOStats();
+ }
+
+ // Open output file if necessary
+ if (sub_compact->builder == nullptr) {
+ status = OpenCompactionOutputFile(sub_compact);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ assert(sub_compact->builder != nullptr);
+ assert(sub_compact->current_output() != nullptr);
+ sub_compact->builder->Add(key, value);
+ sub_compact->current_output_file_size = sub_compact->builder->FileSize();
+ const ParsedInternalKey& ikey = c_iter->ikey();
+ sub_compact->current_output()->meta.UpdateBoundaries(
+ key, value, ikey.sequence, ikey.type);
+ sub_compact->num_output_records++;
+
+    // Close the output file if it is big enough. Two possibilities determine
+    // that it is time to close it: (1) the current key should be this file's
+    // last key, or (2) the next key should not be in this file.
+ //
+ // TODO(aekmekji): determine if file should be closed earlier than this
+ // during subcompactions (i.e. if output size, estimated by input size, is
+ // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
+ // and 0.6MB instead of 1MB and 0.2MB)
+ bool output_file_ended = false;
+ Status input_status;
+ if (sub_compact->compaction->output_level() != 0 &&
+ sub_compact->current_output_file_size >=
+ sub_compact->compaction->max_output_file_size()) {
+ // (1) this key terminates the file. For historical reasons, the iterator
+ // status before advancing will be given to FinishCompactionOutputFile().
+ input_status = input->status();
+ output_file_ended = true;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:2",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(manual_compaction_paused_)));
+ c_iter->Next();
+ if (c_iter->status().IsManualCompactionPaused()) {
+ break;
+ }
+ if (!output_file_ended && c_iter->Valid() &&
+ sub_compact->compaction->output_level() != 0 &&
+ sub_compact->ShouldStopBefore(c_iter->key(),
+ sub_compact->current_output_file_size) &&
+ sub_compact->builder != nullptr) {
+ // (2) this key belongs to the next file. For historical reasons, the
+ // iterator status after advancing will be given to
+ // FinishCompactionOutputFile().
+ input_status = input->status();
+ output_file_ended = true;
+ }
+ if (output_file_ended) {
+ const Slice* next_key = nullptr;
+ if (c_iter->Valid()) {
+ next_key = &c_iter->key();
+ }
+ CompactionIterationStats range_del_out_stats;
+ status =
+ FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg,
+ &range_del_out_stats, next_key);
+ RecordDroppedKeys(range_del_out_stats,
+ &sub_compact->compaction_job_stats);
+ }
+ }
+
+ sub_compact->compaction_job_stats.num_input_deletion_records =
+ c_iter_stats.num_input_deletion_records;
+ sub_compact->compaction_job_stats.num_corrupt_keys =
+ c_iter_stats.num_input_corrupt_records;
+ sub_compact->compaction_job_stats.num_single_del_fallthru =
+ c_iter_stats.num_single_del_fallthru;
+ sub_compact->compaction_job_stats.num_single_del_mismatch =
+ c_iter_stats.num_single_del_mismatch;
+ sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+ c_iter_stats.total_input_raw_key_bytes;
+ sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+ c_iter_stats.total_input_raw_value_bytes;
+
+ RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+ c_iter_stats.total_filter_time);
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ RecordCompactionIOStats();
+
+ if (status.ok() && cfd->IsDropped()) {
+ status =
+ Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_relaxed)) {
+ status = Status::ShutdownInProgress("Database shutdown");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ (manual_compaction_paused_ &&
+ manual_compaction_paused_->load(std::memory_order_relaxed))) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ if (status.ok()) {
+ status = c_iter->status();
+ }
+
+ if (status.ok() && sub_compact->builder == nullptr &&
+ sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) {
+ // handle subcompaction containing only range deletions
+ status = OpenCompactionOutputFile(sub_compact);
+ }
+
+ // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+ // close the output file.
+ if (sub_compact->builder != nullptr) {
+ CompactionIterationStats range_del_out_stats;
+ Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg,
+ &range_del_out_stats);
+ if (status.ok()) {
+ status = s;
+ }
+ RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+ }
+
+ sub_compact->compaction_job_stats.cpu_micros =
+ env_->NowCPUNanos() / 1000 - prev_cpu_micros;
+
+ if (measure_io_stats_) {
+ sub_compact->compaction_job_stats.file_write_nanos +=
+ IOSTATS(write_nanos) - prev_write_nanos;
+ sub_compact->compaction_job_stats.file_fsync_nanos +=
+ IOSTATS(fsync_nanos) - prev_fsync_nanos;
+ sub_compact->compaction_job_stats.file_range_sync_nanos +=
+ IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
+ sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+ IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
+ sub_compact->compaction_job_stats.cpu_micros -=
+ (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
+ IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
+ 1000;
+ if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+ SetPerfLevel(prev_perf_level);
+ }
+ }
+
+ sub_compact->c_iter.reset();
+ input.reset();
+ sub_compact->status = status;
+}
+
+void CompactionJob::RecordDroppedKeys(
+ const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats) {
+ if (c_iter_stats.num_record_drop_user > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+ c_iter_stats.num_record_drop_user);
+ }
+ if (c_iter_stats.num_record_drop_hidden > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+ c_iter_stats.num_record_drop_hidden);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_records_replaced +=
+ c_iter_stats.num_record_drop_hidden;
+ }
+ }
+ if (c_iter_stats.num_record_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+ c_iter_stats.num_record_drop_obsolete);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_expired_deletion_records +=
+ c_iter_stats.num_record_drop_obsolete;
+ }
+ }
+ if (c_iter_stats.num_record_drop_range_del > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
+ c_iter_stats.num_record_drop_range_del);
+ }
+ if (c_iter_stats.num_range_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_range_del_drop_obsolete);
+ }
+ if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_optimized_del_drop_obsolete);
+ }
+}
+
+Status CompactionJob::FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionRangeDelAggregator* range_del_agg,
+ CompactionIterationStats* range_del_out_stats,
+ const Slice* next_table_min_key /* = nullptr */) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+ assert(sub_compact != nullptr);
+ assert(sub_compact->outfile);
+ assert(sub_compact->builder != nullptr);
+ assert(sub_compact->current_output() != nullptr);
+
+ uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber();
+ assert(output_number != 0);
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+ const Comparator* ucmp = cfd->user_comparator();
+
+ // Check for iterator errors
+ Status s = input_status;
+ auto meta = &sub_compact->current_output()->meta;
+ assert(meta != nullptr);
+ if (s.ok()) {
+ Slice lower_bound_guard, upper_bound_guard;
+ std::string smallest_user_key;
+ const Slice *lower_bound, *upper_bound;
+ bool lower_bound_from_sub_compact = false;
+ if (sub_compact->outputs.size() == 1) {
+ // For the first output table, include range tombstones before the min key
+ // but after the subcompaction boundary.
+ lower_bound = sub_compact->start;
+ lower_bound_from_sub_compact = true;
+ } else if (meta->smallest.size() > 0) {
+ // For subsequent output tables, only include range tombstones from min
+ // key onwards since the previous file was extended to contain range
+ // tombstones falling before min key.
+ smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/);
+ lower_bound_guard = Slice(smallest_user_key);
+ lower_bound = &lower_bound_guard;
+ } else {
+ lower_bound = nullptr;
+ }
+ if (next_table_min_key != nullptr) {
+      // This may be the last file in the subcompaction in some cases, so we
+      // need to compare the end key of the subcompaction with the next file's
+      // start key. When the end key is chosen by the subcompaction, we know
+      // that it must be the biggest key in the output file. Therefore, it is
+      // safe to use the smaller of the two keys as the upper bound of the
+      // output file, to ensure that there is no overlap between different
+      // output files.
+ upper_bound_guard = ExtractUserKey(*next_table_min_key);
+ if (sub_compact->end != nullptr &&
+ ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) {
+ upper_bound = sub_compact->end;
+ } else {
+ upper_bound = &upper_bound_guard;
+ }
+ } else {
+ // This is the last file in the subcompaction, so extend until the
+ // subcompaction ends.
+ upper_bound = sub_compact->end;
+ }
+ auto earliest_snapshot = kMaxSequenceNumber;
+ if (existing_snapshots_.size() > 0) {
+ earliest_snapshot = existing_snapshots_[0];
+ }
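+    // True iff the largest key added to this file so far shares its user key
+    // with the chosen upper bound.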
+ bool has_overlapping_endpoints;
+ if (upper_bound != nullptr && meta->largest.size() > 0) {
+ has_overlapping_endpoints =
+ ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
+ } else {
+ has_overlapping_endpoints = false;
+ }
+
+    // The end key of the subcompaction must be greater than or equal to the
+    // upper bound. If the end of the subcompaction is null or the upper bound
+    // is null, this file is the last file in the compaction, so there will be
+    // no overlap between this file and the others.
+ assert(sub_compact->end == nullptr ||
+ upper_bound == nullptr ||
+ ucmp->Compare(*upper_bound , *sub_compact->end) <= 0);
+ auto it = range_del_agg->NewIterator(lower_bound, upper_bound,
+ has_overlapping_endpoints);
+ // Position the range tombstone output iterator. There may be tombstone
+ // fragments that are entirely out of range, so make sure that we do not
+ // include those.
+ if (lower_bound != nullptr) {
+ it->Seek(*lower_bound);
+ } else {
+ it->SeekToFirst();
+ }
+ for (; it->Valid(); it->Next()) {
+ auto tombstone = it->Tombstone();
+ if (upper_bound != nullptr) {
+ int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
+ if ((has_overlapping_endpoints && cmp < 0) ||
+ (!has_overlapping_endpoints && cmp <= 0)) {
+ // Tombstones starting after upper_bound only need to be included in
+ // the next table. If the current SST ends before upper_bound, i.e.,
+ // `has_overlapping_endpoints == false`, we can also skip over range
+ // tombstones that start exactly at upper_bound. Such range tombstones
+ // will be included in the next file and are not relevant to the point
+ // keys or endpoints of the current file.
+ break;
+ }
+ }
+
+ if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
+ // TODO(andrewkr): tombstones that span multiple output files are
+ // counted for each compaction output file, so lots of double counting.
+ range_del_out_stats->num_range_del_drop_obsolete++;
+ range_del_out_stats->num_record_drop_obsolete++;
+ continue;
+ }
+
+ auto kv = tombstone.Serialize();
+ assert(lower_bound == nullptr ||
+ ucmp->Compare(*lower_bound, kv.second) < 0);
+ sub_compact->builder->Add(kv.first.Encode(), kv.second);
+ InternalKey smallest_candidate = std::move(kv.first);
+ if (lower_bound != nullptr &&
+ ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
+ // Pretend the smallest key has the same user key as lower_bound
+ // (the max key in the previous table or subcompaction) in order for
+ // files to appear key-space partitioned.
+ //
+ // When lower_bound is chosen by a subcompaction, we know that
+ // subcompactions over smaller keys cannot contain any keys at
+ // lower_bound. We also know that smaller subcompactions exist, because
+          // otherwise the subcompaction would be unbounded on the left. As a
+ // result, we know that no other files on the output level will contain
+ // actual keys at lower_bound (an output file may have a largest key of
+ // lower_bound@kMaxSequenceNumber, but this only indicates a large range
+ // tombstone was truncated). Therefore, it is safe to use the
+ // tombstone's sequence number, to ensure that keys at lower_bound at
+ // lower levels are covered by truncated tombstones.
+ //
+ // If lower_bound was chosen by the smallest data key in the file,
+ // choose lowest seqnum so this file's smallest internal key comes after
+ // the previous file's largest. The fake seqnum is OK because the read
+ // path's file-picking code only considers user key.
+ smallest_candidate = InternalKey(
+ *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0,
+ kTypeRangeDeletion);
+ }
+ InternalKey largest_candidate = tombstone.SerializeEndKey();
+ if (upper_bound != nullptr &&
+ ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
+ // Pretend the largest key has the same user key as upper_bound (the
+ // min key in the following table or subcompaction) in order for files
+ // to appear key-space partitioned.
+ //
+ // Choose highest seqnum so this file's largest internal key comes
+ // before the next file's/subcompaction's smallest. The fake seqnum is
+ // OK because the read path's file-picking code only considers the user
+ // key portion.
+ //
+ // Note Seek() also creates InternalKey with (user_key,
+ // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+ // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+ // Seek() key in InternalKey's ordering. So Seek() will look in the
+ // next file for the user key.
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+#ifndef NDEBUG
+ SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+ if (meta->smallest.size() > 0) {
+ smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode());
+ }
+#endif
+ meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+ tombstone.seq_,
+ cfd->internal_comparator());
+
+ // The smallest key in a file is used for range tombstone truncation, so
+ // it cannot have a seqnum of 0 (unless the smallest data key in a file
+ // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+ // deleted keys at lower levels.
+ assert(smallest_ikey_seqnum == 0 ||
+ ExtractInternalKeyFooter(meta->smallest.Encode()) !=
+ PackSequenceAndType(0, kTypeRangeDeletion));
+ }
+ meta->marked_for_compaction = sub_compact->builder->NeedCompact();
+ }
+ const uint64_t current_entries = sub_compact->builder->NumEntries();
+ if (s.ok()) {
+ s = sub_compact->builder->Finish();
+ } else {
+ sub_compact->builder->Abandon();
+ }
+ const uint64_t current_bytes = sub_compact->builder->FileSize();
+ if (s.ok()) {
+ // Add the checksum information to file metadata.
+ meta->file_checksum = sub_compact->builder->GetFileChecksum();
+ meta->file_checksum_func_name =
+ sub_compact->builder->GetFileChecksumFuncName();
+
+ meta->fd.file_size = current_bytes;
+ }
+ sub_compact->current_output()->finished = true;
+ sub_compact->total_bytes += current_bytes;
+
+ // Finish and check for file errors
+ if (s.ok()) {
+ StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
+ s = sub_compact->outfile->Sync(db_options_.use_fsync);
+ }
+ if (s.ok()) {
+ s = sub_compact->outfile->Close();
+ }
+ sub_compact->outfile.reset();
+
+ TableProperties tp;
+ if (s.ok()) {
+ tp = sub_compact->builder->GetTableProperties();
+ }
+
+ if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+    // If there is nothing to output, there is no need to generate an SST
+    // file. This happens when the output level is the bottom level and, at
+    // the same time, the sub_compact produced no output.
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+ env_->DeleteFile(fname);
+
+ // Also need to remove the file from outputs, or it will be added to the
+ // VersionEdit.
+ assert(!sub_compact->outputs.empty());
+ sub_compact->outputs.pop_back();
+ meta = nullptr;
+ }
+
+ if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+ // Output to event logger and fire events.
+ sub_compact->current_output()->table_properties =
+ std::make_shared<TableProperties>(tp);
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+ " keys, %" PRIu64 " bytes%s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ current_entries, current_bytes,
+ meta->marked_for_compaction ? " (need compaction)" : "");
+ }
+ std::string fname;
+ FileDescriptor output_fd;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ if (meta != nullptr) {
+ fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+ output_fd = meta->fd;
+ oldest_blob_file_number = meta->oldest_blob_file_number;
+ } else {
+ fname = "(nil)";
+ }
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+ job_id_, output_fd, oldest_blob_file_number, tp,
+ TableFileCreationReason::kCompaction, s);
+
+#ifndef ROCKSDB_LITE
+ // Report new file to SstFileManagerImpl
+ auto sfm =
+ static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+ if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+ sfm->OnAddFile(fname);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ // TODO(ajkr): should we return OK() if max space was reached by the final
+ // compaction output file (similarly to how flush works when full)?
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "CompactionJob::FinishCompactionOutputFile:"
+ "MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(db_mutex_);
+ db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
+ }
+ }
+#endif
+
+ sub_compact->builder.reset();
+ sub_compact->current_output_file_size = 0;
+ return s;
+}
+
+Status CompactionJob::InstallCompactionResults(
+ const MutableCFOptions& mutable_cf_options) {
+ db_mutex_->AssertHeld();
+
+ auto* compaction = compact_->compaction;
+ // paranoia: verify that the files that we started with
+ // still exist in the current version and in the same original level.
+ // This ensures that a concurrent compaction did not erroneously
+ // pick the same files to compact_.
+ if (!versions_->VerifyCompactionFileConsistency(compaction)) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+
+ ROCKS_LOG_ERROR(db_options_.info_log, "[%s] [JOB %d] Compaction %s aborted",
+ compaction->column_family_data()->GetName().c_str(),
+ job_id_, compaction->InputLevelSummary(&inputs_summary));
+ return Status::Corruption("Compaction input files inconsistent");
+ }
+
+ {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes);
+ }
+
+ // Add compaction inputs
+ compaction->AddInputDeletions(compact_->compaction->edit());
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ for (const auto& out : sub_compact.outputs) {
+ compaction->edit()->AddFile(compaction->output_level(), out.meta);
+ }
+ }
+ return versions_->LogAndApply(compaction->column_family_data(),
+ mutable_cf_options, compaction->edit(),
+ db_mutex_, db_directory_);
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+ RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+ IOSTATS_RESET(bytes_read);
+ RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+
+Status CompactionJob::OpenCompactionOutputFile(
+ SubcompactionState* sub_compact) {
+ assert(sub_compact != nullptr);
+ assert(sub_compact->builder == nullptr);
+ // no need to lock because VersionSet::next_file_number_ is atomic
+ uint64_t file_number = versions_->NewFileNumber();
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ file_number, sub_compact->compaction->output_path_id());
+ // Fire events.
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(
+ cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
+ TableFileCreationReason::kCompaction);
+#endif // !ROCKSDB_LITE
+ // Make the output file
+ std::unique_ptr<FSWritableFile> writable_file;
+#ifndef NDEBUG
+ bool syncpoint_arg = file_options_.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
+ &syncpoint_arg);
+#endif
+ Status s = NewWritableFile(fs_, fname, &writable_file, file_options_);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+ " fails at NewWritableFile with status %s",
+ sub_compact->compaction->column_family_data()->GetName().c_str(),
+ job_id_, file_number, s.ToString().c_str());
+ LogFlush(db_options_.info_log);
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
+ fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
+ TableProperties(), TableFileCreationReason::kCompaction, s);
+ return s;
+ }
+
+  // Try to figure out the output file's oldest ancestor time.
+ int64_t temp_current_time = 0;
+ auto get_time_status = env_->GetCurrentTime(&temp_current_time);
+ // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+ if (!get_time_status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time. Status: %s",
+ get_time_status.ToString().c_str());
+ }
+ uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+ uint64_t oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime();
+ if (oldest_ancester_time == port::kMaxUint64) {
+ oldest_ancester_time = current_time;
+ }
+
+ // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
+ {
+ SubcompactionState::Output out;
+ out.meta.fd = FileDescriptor(file_number,
+ sub_compact->compaction->output_path_id(), 0);
+ out.meta.oldest_ancester_time = oldest_ancester_time;
+ out.meta.file_creation_time = current_time;
+ out.finished = false;
+ sub_compact->outputs.push_back(out);
+ }
+
+ writable_file->SetIOPriority(Env::IOPriority::IO_LOW);
+ writable_file->SetWriteLifeTimeHint(write_hint_);
+ writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+ sub_compact->compaction->OutputFilePreallocationSize()));
+ const auto& listeners =
+ sub_compact->compaction->immutable_cf_options()->listeners;
+ sub_compact->outfile.reset(
+ new WritableFileWriter(std::move(writable_file), fname, file_options_,
+ env_, db_options_.statistics.get(), listeners,
+ db_options_.sst_file_checksum_func.get()));
+
+  // If the column family is configured to only optimize filters for hits,
+  // we can skip creating filters if this is the bottommost level, where
+  // the data is going to be found.
+ bool skip_filters =
+ cfd->ioptions()->optimize_filters_for_hits && bottommost_level_;
+
+ sub_compact->builder.reset(NewTableBuilder(
+ *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()),
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(),
+ sub_compact->compaction->output_compression(),
+ 0 /*sample_for_compression */,
+ sub_compact->compaction->output_compression_opts(),
+ sub_compact->compaction->output_level(), skip_filters,
+ oldest_ancester_time, 0 /* oldest_key_time */,
+ sub_compact->compaction->max_output_file_size(), current_time));
+ LogFlush(db_options_.info_log);
+ return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+ for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+ const auto& sub_status = sub_compact.status;
+
+ if (sub_compact.builder != nullptr) {
+ // May happen if we get a shutdown call in the middle of compaction
+ sub_compact.builder->Abandon();
+ sub_compact.builder.reset();
+ } else {
+ assert(!sub_status.ok() || sub_compact.outfile == nullptr);
+ }
+ for (const auto& out : sub_compact.outputs) {
+      // If this file was inserted into the table cache, then remove it
+      // here because this compaction was not committed.
+ if (!sub_status.ok()) {
+ TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber());
+ }
+ }
+ }
+ delete compact_;
+ compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+} // namespace
+
+#endif // !ROCKSDB_LITE
+
+void CompactionJob::UpdateCompactionStats() {
+ Compaction* compaction = compact_->compaction;
+ compaction_stats_.num_input_files_in_non_output_levels = 0;
+ compaction_stats_.num_input_files_in_output_level = 0;
+ for (int input_level = 0;
+ input_level < static_cast<int>(compaction->num_input_levels());
+ ++input_level) {
+ if (compaction->level(input_level) != compaction->output_level()) {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.num_input_files_in_non_output_levels,
+ &compaction_stats_.bytes_read_non_output_levels, input_level);
+ } else {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.num_input_files_in_output_level,
+ &compaction_stats_.bytes_read_output_level, input_level);
+ }
+ }
+
+ uint64_t num_output_records = 0;
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ size_t num_output_files = sub_compact.outputs.size();
+ if (sub_compact.builder != nullptr) {
+ // An error occurred so ignore the last output.
+ assert(num_output_files > 0);
+ --num_output_files;
+ }
+ compaction_stats_.num_output_files += static_cast<int>(num_output_files);
+
+ num_output_records += sub_compact.num_output_records;
+
+ for (const auto& out : sub_compact.outputs) {
+ compaction_stats_.bytes_written += out.meta.fd.file_size;
+ }
+ }
+
+ if (compaction_stats_.num_input_records > num_output_records) {
+ compaction_stats_.num_dropped_records =
+ compaction_stats_.num_input_records - num_output_records;
+ }
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
+ uint64_t* bytes_read,
+ int input_level) {
+ const Compaction* compaction = compact_->compaction;
+ auto num_input_files = compaction->num_input_files(input_level);
+ *num_files += static_cast<int>(num_input_files);
+
+ for (size_t i = 0; i < num_input_files; ++i) {
+ const auto* file_meta = compaction->input(input_level, i);
+ *bytes_read += file_meta->fd.GetFileSize();
+ compaction_stats_.num_input_records +=
+ static_cast<uint64_t>(file_meta->num_entries);
+ }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+ if (compaction_job_stats_) {
+ compaction_job_stats_->elapsed_micros = stats.micros;
+
+ // input information
+ compaction_job_stats_->total_input_bytes =
+ stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+ compaction_job_stats_->num_input_records = stats.num_input_records;
+ compaction_job_stats_->num_input_files =
+ stats.num_input_files_in_non_output_levels +
+ stats.num_input_files_in_output_level;
+ compaction_job_stats_->num_input_files_at_output_level =
+ stats.num_input_files_in_output_level;
+
+ // output information
+ compaction_job_stats_->total_output_bytes = stats.bytes_written;
+ compaction_job_stats_->num_output_records = compact_->num_output_records;
+ compaction_job_stats_->num_output_files = stats.num_output_files;
+
+ if (compact_->NumOutputFiles() > 0U) {
+ CopyPrefix(compact_->SmallestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->smallest_output_key_prefix);
+ CopyPrefix(compact_->LargestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->largest_output_key_prefix);
+ }
+ }
+#else
+ (void)stats;
+#endif // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+ Compaction* compaction = compact_->compaction;
+ ColumnFamilyData* cfd = compaction->column_family_data();
+
+ // Let's check if anything will get logged. Don't prepare all the info if
+ // we're not logging
+ if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+ cfd->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compaction->score());
+ char scratch[2345];
+ compaction->Summary(scratch, sizeof(scratch));
+ ROCKS_LOG_INFO(db_options_.info_log, "[%s] Compaction start summary: %s\n",
+ cfd->GetName().c_str(), scratch);
+ // build event logger report
+ auto stream = event_logger_->Log();
+ stream << "job" << job_id_ << "event"
+ << "compaction_started"
+ << "compaction_reason"
+ << GetCompactionReasonString(compaction->compaction_reason());
+ for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+ stream << ("files_L" + ToString(compaction->level(i)));
+ stream.StartArray();
+ for (auto f : *compaction->inputs(i)) {
+ stream << f->fd.GetNumber();
+ }
+ stream.EndArray();
+ }
+ stream << "score" << compaction->score() << "input_data_size"
+ << compaction->CalculateTotalInputSize();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h
new file mode 100644
index 000000000..c15f502a1
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.h
@@ -0,0 +1,198 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/memtable_list.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ErrorHandler;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+// CompactionJob is responsible for executing the compaction. Each (manual or
+// automated) compaction corresponds to a CompactionJob object, and usually
+// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
+// will divide the compaction into subcompactions and execute them in parallel
+// if needed.
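+//
+// A rough usage sketch (illustrative only; see DBImpl for the actual call
+// sites and the per-method locking requirements documented below):
+//
+//   CompactionJob job(job_id, compaction, db_options, file_options, ...);
+//   job.Prepare();                            // db mutex held
+//   Status s = job.Run();                     // db mutex NOT held
+//   s = job.Install(mutable_cf_options);      // db mutex held again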
+class CompactionJob {
+ public:
+ CompactionJob(int job_id, Compaction* compaction,
+ const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ LogBuffer* log_buffer, Directory* db_directory,
+ Directory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname,
+ CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri,
+ const std::atomic<bool>* manual_compaction_paused = nullptr);
+
+ ~CompactionJob();
+
+ // no copy/move
+ CompactionJob(CompactionJob&& job) = delete;
+ CompactionJob(const CompactionJob& job) = delete;
+ CompactionJob& operator=(const CompactionJob& job) = delete;
+
+ // REQUIRED: mutex held
+ // Prepare for the compaction by setting up boundaries for each subcompaction
+ void Prepare();
+  // REQUIRED: mutex not held
+  // Launch threads for each subcompaction and wait for them to finish. After
+  // that, verify that the output tables are usable and finally do
+  // bookkeeping to unify subcompaction results.
+ Status Run();
+
+ // REQUIRED: mutex held
+ // Add compaction input/output to the current version
+ Status Install(const MutableCFOptions& mutable_cf_options);
+
+ private:
+ struct SubcompactionState;
+
+ void AggregateStatistics();
+
+ // Generates a histogram representing potential divisions of key ranges from
+ // the input. It adds the starting and/or ending keys of certain input files
+ // to the working set and then finds the approximate size of data in between
+ // each consecutive pair of slices. Then it divides these ranges into
+ // consecutive groups such that each group has a similar size.
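+  // Roughly: N boundary keys split the compaction's key range into N + 1
+  // contiguous subcompaction ranges of comparable estimated data size.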
+ void GenSubcompactionBoundaries();
+
+ // update the thread status for starting a compaction.
+ void ReportStartedCompaction(Compaction* compaction);
+ void AllocateCompactionOutputFileNumbers();
+ // Call compaction filter. Then iterate through input and compact the
+ // kv-pairs
+ void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+ Status FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionRangeDelAggregator* range_del_agg,
+ CompactionIterationStats* range_del_out_stats,
+ const Slice* next_table_min_key = nullptr);
+ Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
+ void RecordCompactionIOStats();
+ Status OpenCompactionOutputFile(SubcompactionState* sub_compact);
+ void CleanupCompaction();
+ void UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const;
+ void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats = nullptr);
+
+ void UpdateCompactionStats();
+ void UpdateCompactionInputStatsHelper(
+ int* num_files, uint64_t* bytes_read, int input_level);
+
+ void LogCompaction();
+
+ int job_id_;
+
+ // CompactionJob state
+ struct CompactionState;
+ CompactionState* compact_;
+ CompactionJobStats* compaction_job_stats_;
+ InternalStats::CompactionStats compaction_stats_;
+
+ // DBImpl state
+ const std::string& dbname_;
+ const ImmutableDBOptions& db_options_;
+ const FileOptions file_options_;
+
+ Env* env_;
+ FileSystem* fs_;
+  // FileOptions optimized for compaction table reads
+ FileOptions file_options_for_read_;
+ VersionSet* versions_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>* manual_compaction_paused_;
+ const SequenceNumber preserve_deletes_seqnum_;
+ LogBuffer* log_buffer_;
+ Directory* db_directory_;
+ Directory* output_directory_;
+ Statistics* stats_;
+ InstrumentedMutex* db_mutex_;
+ ErrorHandler* db_error_handler_;
+  // If there were two snapshots with seq numbers s1 and
+  // s2, with s1 < s2, and if we find two instances of a key k1 that lie
+  // entirely within s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
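+  // For example, with snapshots at s1 = 5 and s2 = 10, a version of k1 at
+  // seq 6 that is shadowed by a newer version at seq 8 is not visible to
+  // either snapshot and can be dropped.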
+ std::vector<SequenceNumber> existing_snapshots_;
+
+ // This is the earliest snapshot that could be used for write-conflict
+ // checking by a transaction. For any user-key newer than this snapshot, we
+ // should make sure not to remove evidence that a write occurred.
+ SequenceNumber earliest_write_conflict_snapshot_;
+
+ const SnapshotChecker* const snapshot_checker_;
+
+ std::shared_ptr<Cache> table_cache_;
+
+ EventLogger* event_logger_;
+
+  // Is this compaction creating a file in the bottommost level?
+ bool bottommost_level_;
+ bool paranoid_file_checks_;
+ bool measure_io_stats_;
+ // Stores the Slices that designate the boundaries for each subcompaction
+ std::vector<Slice> boundaries_;
+ // Stores the approx size of keys covered in the range of each subcompaction
+ std::vector<uint64_t> sizes_;
+ Env::WriteLifeTimeHint write_hint_;
+ Env::Priority thread_pri_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
new file mode 100644
index 000000000..51a665797
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
@@ -0,0 +1,1043 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memtable/hash_linklist_rep.h"
+#include "monitoring/statistics.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE)
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+ std::string r;
+ test::CompressibleString(rnd, ratio, len, &r);
+ return r;
+}
+
+std::string Key(uint64_t key, int length) {
+ const int kBufSize = 1000;
+ char buf[kBufSize];
+ if (length > kBufSize) {
+ length = kBufSize;
+ }
+ snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+ return std::string(buf);
+}
+
+class CompactionJobStatsTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ Env* env_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+ uint32_t max_subcompactions_;
+
+ Options last_options_;
+
+ CompactionJobStatsTest() : env_(Env::Default()) {
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath("compaction_job_stats_test");
+ alternative_wal_dir_ = dbname_ + "/wal";
+ Options options;
+ options.create_if_missing = true;
+ max_subcompactions_ = GetParam();
+ options.max_subcompactions = max_subcompactions_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ }
+
+ ~CompactionJobStatsTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ DBImpl* dbfull() {
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+ }
+ }
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ Status TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+ }
+
+ void Reopen(const Options& options) {
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Close() {
+ for (auto h : handles_) {
+ delete h;
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(const Options& options) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+
+ Status ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+ }
+
+ Status TryReopen(const Options& options) {
+ Close();
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+ }
+
+ Status Flush(int cf = 0) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+ }
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, k, v);
+ }
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+
+ Status Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+ }
+
+ Status Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level, int cf = 0) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+ &property));
+ }
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
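+  // e.g. "3,2" means 3 files in L0 and 2 files in L1; trailing levels with
+  // zero files are trimmed from the result.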
+ std::string FilesPerLevel(int cf = 0) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) {
+ Range r(start, limit);
+ uint64_t size;
+ if (cf == 0) {
+ db_->GetApproximateSizes(&r, 1, &size);
+ } else {
+ db_->GetApproximateSizes(handles_[1], &r, 1, &size);
+ }
+ return size;
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ void Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+ }
+
+ void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf],
+ true /* disallow trivial move */));
+ }
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ }
+ }
+
+ static void SetDeletionCompactionStats(
+ CompactionJobStats *stats, uint64_t input_deletions,
+ uint64_t expired_deletions, uint64_t records_replaced) {
+ stats->num_input_deletion_records = input_deletions;
+ stats->num_expired_deletion_records = expired_deletions;
+ stats->num_records_replaced = records_replaced;
+ }
+
+ void MakeTableWithKeyValues(
+ Random* rnd, uint64_t smallest, uint64_t largest,
+ int key_size, int value_size, uint64_t interval,
+ double ratio, int cf = 0) {
+ for (auto key = smallest; key < largest; key += interval) {
+ ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+ Slice(RandomString(rnd, value_size, ratio))));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+  // This function assumes that two rounds of keys have been inserted into
+  // the database, as done in DeletionStatsTest.
+ void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+ uint64_t interval, int deletion_interval, int key_size,
+ uint64_t cutoff_key_num, CompactionJobStats* stats, int cf = 0) {
+
+    // interval needs to be >= 2 so that we can insert deletion entries that
+    // are not intended to delete an actual key, by using an offset of 1
+    // from an existing key.
+ ASSERT_GE(interval, 2);
+
+ uint64_t ctr = 1;
+ uint32_t deletions_made = 0;
+ uint32_t num_deleted = 0;
+ uint32_t num_expired = 0;
+ for (auto key = smallest; key <= largest; key += interval, ctr++) {
+ if (ctr % deletion_interval == 0) {
+ ASSERT_OK(Delete(cf, Key(key, key_size)));
+ deletions_made++;
+ num_deleted++;
+
+ if (key > cutoff_key_num) {
+ num_expired++;
+ }
+ }
+ }
+
+ // Insert some deletions for keys that don't exist that
+ // are both in and out of the key range
+ ASSERT_OK(Delete(cf, Key(smallest+1, key_size)));
+ deletions_made++;
+
+ ASSERT_OK(Delete(cf, Key(smallest-1, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Delete(cf, Key(smallest-9, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Flush(cf));
+ SetDeletionCompactionStats(stats, deletions_made, num_expired,
+ num_deleted);
+ }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+ CompactionJobStatsChecker()
+ : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+ size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+ void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+  // Once a compaction has completed, this function will verify the returned
+  // CompactionJobInfo against the oldest CompactionJobInfo added earlier
+  // to "expected_stats_" that has not yet been used for verification.
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ if (verify_next_comp_io_stats_) {
+ ASSERT_GT(ci.stats.file_write_nanos, 0);
+ ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+ ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+ ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+ verify_next_comp_io_stats_ = false;
+ }
+
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (expected_stats_.size()) {
+ Verify(ci.stats, expected_stats_.front());
+ expected_stats_.pop();
+ }
+ }
+
+  // A helper function which verifies whether two CompactionJobStats
+  // match. The verification of all compaction stats is done with
+  // ASSERT_EQ, except for the total input / output bytes, for which we
+  // use ASSERT_GE and ASSERT_LE with a reasonable bias:
+  // 10% in the uncompressed case and 20% when compression is used.
+ virtual void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) {
+ // time
+ ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(current_stats.num_input_records,
+ stats.num_input_records);
+ ASSERT_EQ(current_stats.num_input_files,
+ stats.num_input_files);
+ ASSERT_EQ(current_stats.num_input_files_at_output_level,
+ stats.num_input_files_at_output_level);
+
+ ASSERT_EQ(current_stats.num_output_records,
+ stats.num_output_records);
+ ASSERT_EQ(current_stats.num_output_files,
+ stats.num_output_files);
+
+ ASSERT_EQ(current_stats.is_manual_compaction,
+ stats.is_manual_compaction);
+
+ // file size
+ double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+ ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+ stats.total_input_bytes);
+ ASSERT_LE(current_stats.total_input_bytes,
+ stats.total_input_bytes * (1.00 + kFileSizeBias));
+ ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+ stats.total_output_bytes);
+ ASSERT_LE(current_stats.total_output_bytes,
+ stats.total_output_bytes * (1.00 + kFileSizeBias));
+ ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+ stats.total_input_raw_key_bytes);
+ ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+ stats.total_input_raw_value_bytes);
+
+ ASSERT_EQ(current_stats.num_records_replaced,
+ stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys,
+ stats.num_corrupt_keys);
+
+ ASSERT_EQ(
+ std::string(current_stats.smallest_output_key_prefix),
+ std::string(stats.smallest_output_key_prefix));
+ ASSERT_EQ(
+ std::string(current_stats.largest_output_key_prefix),
+ std::string(stats.largest_output_key_prefix));
+ }
+
+  // Add expected compaction stats, which will be used to
+ // verify the CompactionJobStats returned by the OnCompactionCompleted()
+ // callback.
+ void AddExpectedStats(const CompactionJobStats& stats) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ expected_stats_.push(stats);
+ }
+
+ void EnableCompression(bool flag) {
+ compression_enabled_ = flag;
+ }
+
+ bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+ std::mutex mutex_;
+ std::queue<CompactionJobStats> expected_stats_;
+ bool compression_enabled_;
+ bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+ // Verifies whether two CompactionJobStats match.
+ void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) override {
+ ASSERT_EQ(
+ current_stats.num_input_deletion_records,
+ stats.num_input_deletion_records);
+ ASSERT_EQ(
+ current_stats.num_expired_deletion_records,
+ stats.num_expired_deletion_records);
+ ASSERT_EQ(
+ current_stats.num_records_replaced,
+ stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys,
+ stats.num_corrupt_keys);
+ }
+};
+
+namespace {
+
+uint64_t EstimatedFileSize(
+ uint64_t num_records, size_t key_size, size_t value_size,
+ double compression_ratio = 1.0,
+ size_t block_size = 4096,
+ int bloom_bits_per_key = 10) {
+ const size_t kPerKeyOverhead = 8;
+ const size_t kFooterSize = 512;
+
+ uint64_t data_size =
+ static_cast<uint64_t>(
+ num_records * (key_size + value_size * compression_ratio +
+ kPerKeyOverhead));
+
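+  // Rough example: 100 records with 10-byte keys and 1000-byte uncompressed
+  // values give data_size = 100 * (10 + 1000 + 8) = 101800 bytes; adding the
+  // 512-byte footer, ~125 filter bytes and ~447 index bytes yields ~102.9 KB.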
+ return data_size + kFooterSize
+ + num_records * bloom_bits_per_key / 8 // filter block
+ + data_size * (key_size + 8) / block_size; // index block
+}
+
+namespace {
+
+void CopyPrefix(
+ const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+
+} // namespace
+
+CompactionJobStats NewManualCompactionJobStats(
+ const std::string& smallest_key, const std::string& largest_key,
+ size_t num_input_files, size_t num_input_files_at_output_level,
+ uint64_t num_input_records, size_t key_size, size_t value_size,
+ size_t num_output_files, uint64_t num_output_records,
+ double compression_ratio, uint64_t num_records_replaced,
+ bool is_manual = true) {
+ CompactionJobStats stats;
+ stats.Reset();
+
+ stats.num_input_records = num_input_records;
+ stats.num_input_files = num_input_files;
+ stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+ stats.num_output_records = num_output_records;
+ stats.num_output_files = num_output_files;
+
+ stats.total_input_bytes =
+ EstimatedFileSize(
+ num_input_records / num_input_files,
+ key_size, value_size, compression_ratio) * num_input_files;
+ stats.total_output_bytes =
+ EstimatedFileSize(
+ num_output_records / num_output_files,
+ key_size, value_size, compression_ratio) * num_output_files;
+ stats.total_input_raw_key_bytes =
+ num_input_records * (key_size + 8);
+ stats.total_input_raw_value_bytes =
+ num_input_records * value_size;
+
+ stats.is_manual_compaction = is_manual;
+
+ stats.num_records_replaced = num_records_replaced;
+
+ CopyPrefix(smallest_key,
+ CompactionJobStats::kMaxPrefixLength,
+ &stats.smallest_output_key_prefix);
+ CopyPrefix(largest_key,
+ CompactionJobStats::kMaxPrefixLength,
+ &stats.largest_output_key_prefix);
+
+ return stats;
+}
+
+CompressionType GetAnyCompression() {
+ if (Snappy_Supported()) {
+ return kSnappyCompression;
+ } else if (Zlib_Supported()) {
+ return kZlibCompression;
+ } else if (BZip2_Supported()) {
+ return kBZip2Compression;
+ } else if (LZ4_Supported()) {
+ return kLZ4Compression;
+ } else if (XPRESS_Supported()) {
+ return kXpressCompression;
+ }
+
+ return kNoCompression;
+}
+
+} // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+ Random rnd(301);
+ const int kBufSize = 100;
+ char buf[kBufSize];
+ uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 100;
+ const int kTestScale = 8;
+ const int kKeySize = 10;
+ const int kValueSize = 1000;
+ const double kCompressionRatio = 0.5;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect. The expected CompactionJobStats is added
+ // via AddExpectedStats().
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+  // Just enough settings to hold off auto-compaction.
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options.bytes_per_sync = 512 * 1024;
+
+ options.report_bg_io_stats = true;
+ for (int test = 0; test < 2; ++test) {
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // 1st Phase: generate "num_L0_files" L0 files.
+ int num_L0_files = 0;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d", ++num_L0_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+ ASSERT_EQ(ToString(num_L0_files), FilesPerLevel(1));
+
+ // 2nd Phase: perform L0 -> L1 compaction.
+ int L0_compaction_count = 6;
+ int count = 1;
+ std::string smallest_key;
+ std::string largest_key;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * L0_compaction_count;
+ start_key += key_base, count++) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ 1, 0, num_keys_per_L0_file,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file,
+ compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // compact two files into one in the last L0 -> L1 compaction
+ int num_remaining_L0 = num_L0_files - L0_compaction_count;
+ smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ num_remaining_L0,
+ 0, num_keys_per_L0_file * num_remaining_L0,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file * num_remaining_L0,
+ compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+
+ int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+ num_L0_files = 0;
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+ // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys)
+ int sparseness = 2;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base * sparseness) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base * sparseness - 1,
+ kKeySize, kValueSize,
+ key_base * sparseness / num_keys_per_L0_file,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // 4th Phase: perform L0 -> L1 compaction again, expect higher write amp
+ // When subcompactions are enabled, the number of output files increases
+ // by 1 because multiple threads are consuming the input and generating
+    // output files without coordinating to see if the output could fit into
+    // a smaller number of files, as it does when the compaction runs
+    // sequentially.
+ int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+ for (uint64_t start_key = key_base;
+ num_L0_files > 1;
+ start_key += key_base * sparseness) {
+ smallest_key = Key(start_key, 10);
+ largest_key =
+ Key(start_key + key_base * sparseness - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ 3, 2, num_keys_per_L0_file * 3,
+ kKeySize, kValueSize,
+ num_output_files,
+ num_keys_per_L0_file * 2, // 1/3 of the data will be updated.
+ compression_ratio,
+ num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+ if (options.max_subcompactions == 1) {
+ --num_L1_files;
+ }
+ snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+    // 5th Phase: Do a full compaction, which involves two sub-compactions.
+    // Here we expect to have 1 L0 file and 4 L1 files.
+    // In the first sub-compaction, we expect an L0 compaction.
+ smallest_key = Key(key_base, 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key,
+ 2, 1, num_keys_per_L0_file * 3,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file * 2,
+ compression_ratio,
+ num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+
+ num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+ char L1_buf[4];
+ snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+ std::string L1_files(L1_buf);
+ ASSERT_EQ(L1_files, FilesPerLevel(1));
+ options.compression = GetAnyCompression();
+ if (options.compression == kNoCompression) {
+ break;
+ }
+ stats_checker->EnableCompression(true);
+ compression_ratio = kCompressionRatio;
+
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+ Slice(RandomString(&rnd, 512 * 1024, 1))));
+ }
+
+ ASSERT_OK(Flush(1));
+ reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+
+ stats_checker->set_verify_next_comp_io_stats(true);
+ std::atomic<bool> first_prepare_write(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) {
+ if (first_prepare_write.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_prepare_write.store(false);
+ }
+ });
+
+ std::atomic<bool> first_flush(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) {
+ if (first_flush.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_flush.store(false);
+ }
+ });
+
+ std::atomic<bool> first_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) {
+ if (first_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_sync.store(false);
+ }
+ });
+
+ std::atomic<bool> first_range_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) {
+ if (first_range_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_range_sync.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Compact(1, smallest_key, largest_key);
+
+ ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+ ASSERT_TRUE(!first_prepare_write.load());
+ ASSERT_TRUE(!first_flush.load());
+ ASSERT_TRUE(!first_sync.load());
+ ASSERT_TRUE(!first_range_sync.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 20;
+ const int kTestScale = 8; // make sure this is even
+ const int kKeySize = 10;
+ const int kValueSize = 100;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+ uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+ uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+ const std::string smallest_key = Key(key_base - 10, kKeySize);
+ const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect.
+ auto* stats_checker = new CompactionJobDeletionStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = kTestScale+1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Stage 1: Generate several L0 files and then send them to L2 by
+ // using CompactRangeOptions and CompactRange(). These files will
+ // have a strict subset of the keys from the full key-range
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale / 2;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ }
+
+ CompactRangeOptions cr_options;
+ cr_options.change_level = true;
+ cr_options.target_level = 2;
+ db_->CompactRange(cr_options, handles_[1], nullptr, nullptr);
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Stage 2: Generate files including keys from the entire key range
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ }
+
+ // Send these L0 files to L1
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+  // Add a new record and flush so now there is an L0 file
+ // with a value too (not just deletions from the next step)
+ ASSERT_OK(Put(1, Key(key_base-6, kKeySize), "test"));
+ ASSERT_OK(Flush(1));
+
+ // Stage 3: Generate L0 files with some deletions so now
+ // there are files with the same key range in L0, L1, and L2
+ int deletion_interval = 3;
+ CompactionJobStats first_compaction_stats;
+ SelectivelyDeleteKeys(key_base, largest_key_num,
+ key_interval, deletion_interval, kKeySize, cutoff_key_num,
+ &first_compaction_stats, 1);
+
+ stats_checker->AddExpectedStats(first_compaction_stats);
+
+ // Stage 4: Trigger compaction and verify the stats
+ TEST_Compact(0, 1, smallest_key, largest_key);
+}
+
+namespace {
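+// Returns the lowest set bit of num_flushes as the expected number of input
+// units (newly flushed files) of the next universal compaction, or 0 when
+// that bit is 1 and no multi-file compaction is expected.
+// For example: 6 -> 2, 4 -> 4, 5 -> 0.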
+int GetUniversalCompactionInputUnits(uint32_t num_flushes) {
+ uint32_t compaction_input_units;
+ for (compaction_input_units = 1;
+ num_flushes >= compaction_input_units;
+ compaction_input_units *= 2) {
+ if ((num_flushes & compaction_input_units) != 0) {
+ return compaction_input_units > 1 ? compaction_input_units : 0;
+ }
+ }
+ return 0;
+}
+} // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_table
+ int num_keys_per_table = 100;
+ const uint32_t kTestScale = 6;
+ const int kKeySize = 10;
+ const int kValueSize = 900;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_table;
+
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = num_keys_per_table * 1000;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 1;
+ options.compaction_options_universal.max_size_amplification_percent = 1000;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Generates the expected CompactionJobStats for each compaction
+ for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+    // Here we treat one newly flushed file as a unit.
+ //
+ // For example, if a newly flushed file is 100k, and a compaction has
+ // 4 input units, then this compaction inputs 400k.
+ uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+ if (num_input_units == 0) {
+ continue;
+ }
+ // The following statement determines the expected smallest key
+ // based on whether it is a full compaction. A full compaction only
+    // happens when the number of flushes equals the number of compaction
+ // input runs.
+ uint64_t smallest_key =
+ (num_flushes == num_input_units) ?
+ key_base : key_base * (num_flushes - 1);
+
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ Key(smallest_key, 10),
+ Key(smallest_key + key_base * num_input_units - key_interval, 10),
+ num_input_units,
+ num_input_units > 2 ? num_input_units / 2 : 0,
+ num_keys_per_table * num_input_units,
+ kKeySize, kValueSize,
+ num_input_units,
+ num_keys_per_table * num_input_units,
+ 1.0, 0, false));
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U);
+
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+ ::testing::Values(1, 4));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
+
+#else
+
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc
new file mode 100644
index 000000000..e7b46ef97
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_test.cc
@@ -0,0 +1,1082 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <tuple>
+
+#include "db/blob_index.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
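+// Formats `key` as a zero-padded decimal string of the given length
+// (e.g. Key(42, 10) == "0000000042"); the length is capped at 1000.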
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+ const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+ ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+ ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+ ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+ ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+ ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif // !defined(IOS_CROSS_COMPILE)
+}
+
+} // namespace
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public testing::Test {
+ public:
+ CompactionJobTest()
+ : env_(Env::Default()),
+ fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+ dbname_(test::PerThreadDBPath("compaction_job_test")),
+ db_options_(),
+ mutable_cf_options_(cf_options_),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ versions_(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr)),
+ shutting_down_(false),
+ preserve_deletes_seqnum_(0),
+ mock_table_factory_(new mock::MockTableFactory()),
+ error_handler_(nullptr, db_options_, &mutex_) {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ std::string GenerateFileName(uint64_t file_number) {
+ FileMetaData meta;
+ std::vector<DbPath> db_paths;
+ db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+ meta.fd = FileDescriptor(file_number, 0, 0);
+ return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+ }
+
+ static std::string KeyStr(const std::string& user_key,
+ const SequenceNumber seq_num, const ValueType t) {
+ return InternalKey(user_key, seq_num, t).Encode().ToString();
+ }
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size, uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+ size, kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrInlinedTTL(const Slice& value,
+ uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+ return blob_index;
+ }
+
+ void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) {
+ assert(contents.size() > 0);
+
+ bool first_key = true;
+ std::string smallest, largest;
+ InternalKey smallest_key, largest_key;
+ SequenceNumber smallest_seqno = kMaxSequenceNumber;
+ SequenceNumber largest_seqno = 0;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ for (auto kv : contents) {
+ ParsedInternalKey key;
+ std::string skey;
+ std::string value;
+ std::tie(skey, value) = kv;
+ bool parsed = ParseInternalKey(skey, &key);
+
+ smallest_seqno = std::min(smallest_seqno, key.sequence);
+ largest_seqno = std::max(largest_seqno, key.sequence);
+
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+ smallest.assign(key.user_key.data(), key.user_key.size());
+ smallest_key.DecodeFrom(skey);
+ }
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+ largest.assign(key.user_key.data(), key.user_key.size());
+ largest_key.DecodeFrom(skey);
+ }
+
+ first_key = false;
+
+ if (parsed && key.type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ continue;
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL() ||
+ blob_index.file_number() == kInvalidBlobFileNumber) {
+ continue;
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+ }
+
+ uint64_t file_number = versions_->NewFileNumber();
+ EXPECT_OK(mock_table_factory_->CreateMockTable(
+ env_, GenerateFileName(file_number), std::move(contents)));
+
+ VersionEdit edit;
+ edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key,
+ smallest_seqno, largest_seqno, false, oldest_blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+
+ mutex_.Lock();
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_);
+ mutex_.Unlock();
+ }
+
+ void SetLastSequence(const SequenceNumber sequence_number) {
+ versions_->SetLastAllocatedSequence(sequence_number + 1);
+ versions_->SetLastPublishedSequence(sequence_number + 1);
+ versions_->SetLastSequence(sequence_number + 1);
+ }
+
+  // Returns the expected result after compaction.
+ stl_wrappers::KVMap CreateTwoFiles(bool gen_corrupted_keys) {
+ auto expected_results = mock::MakeMockFile();
+ const int kKeysPerFile = 10000;
+ const int kCorruptKeysPerFile = 200;
+ const int kMatchingKeys = kKeysPerFile / 2;
+ SequenceNumber sequence_number = 0;
+
+ auto corrupt_id = [&](int id) {
+ return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+ };
+
+ for (int i = 0; i < 2; ++i) {
+ auto contents = mock::MakeMockFile();
+ for (int k = 0; k < kKeysPerFile; ++k) {
+ auto key = ToString(i * kMatchingKeys + k);
+ auto value = ToString(i * kKeysPerFile + k);
+ InternalKey internal_key(key, ++sequence_number, kTypeValue);
+
+        // This is what the key will look like once it's written to the
+        // bottommost file.
+ InternalKey bottommost_internal_key(
+ key, 0, kTypeValue);
+
+ if (corrupt_id(k)) {
+ test::CorruptKeyType(&internal_key);
+ test::CorruptKeyType(&bottommost_internal_key);
+ }
+ contents.insert({ internal_key.Encode().ToString(), value });
+ if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+ expected_results.insert(
+ { bottommost_internal_key.Encode().ToString(), value });
+ }
+ }
+
+ AddMockFile(contents);
+ }
+
+ SetLastSequence(sequence_number);
+
+ return expected_results;
+ }
+
+ void NewDB() {
+ DestroyDB(dbname_, Options());
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr));
+ compaction_job_stats_.Reset();
+ SetIdentityFile(env_, dbname_);
+
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFile> file;
+ Status s = env_->NewWritableFile(
+ manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_));
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1, nullptr);
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ cf_options_.table_factory = mock_table_factory_;
+ cf_options_.merge_operator = merge_op_;
+ cf_options_.compaction_filter = compaction_filter_.get();
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+ EXPECT_OK(versions_->Recover(column_families, false));
+ cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+ }
+
+ void RunCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const stl_wrappers::KVMap& expected_results,
+ const std::vector<SequenceNumber>& snapshots = {},
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ int output_level = 1, bool verify = true,
+ uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) {
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+ size_t num_input_files = 0;
+ std::vector<CompactionInputFiles> compaction_input_files;
+ for (size_t level = 0; level < input_files.size(); level++) {
+ auto level_files = input_files[level];
+ CompactionInputFiles compaction_level;
+ compaction_level.level = static_cast<int>(level);
+ compaction_level.files.insert(compaction_level.files.end(),
+ level_files.begin(), level_files.end());
+ compaction_input_files.push_back(compaction_level);
+ num_input_files += level_files.size();
+ }
+
+ Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions(),
+ compaction_input_files, output_level, 1024 * 1024,
+ 10 * 1024 * 1024, 0, kNoCompression,
+ cfd->ioptions()->compression_opts, 0, {}, true);
+ compaction.SetInputVersion(cfd->current());
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+ mutex_.Lock();
+ EventLogger event_logger(db_options_.info_log.get());
+ // TODO(yiwu) add a mock snapshot checker and add test for it.
+ SnapshotChecker* snapshot_checker = nullptr;
+ CompactionJob compaction_job(
+ 0, &compaction, db_options_, env_options_, versions_.get(),
+ &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr,
+ nullptr, nullptr, &mutex_, &error_handler_, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, table_cache_,
+ &event_logger, false, false, dbname_, &compaction_job_stats_,
+ Env::Priority::USER);
+ VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+ compaction_job.Prepare();
+ mutex_.Unlock();
+ Status s;
+ s = compaction_job.Run();
+ ASSERT_OK(s);
+ mutex_.Lock();
+ ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions()));
+ mutex_.Unlock();
+
+ if (verify) {
+ ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+ ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+
+ if (expected_results.empty()) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+ } else {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
+ mock_table_factory_->AssertLatestFile(expected_results);
+
+ auto output_files =
+ cfd->current()->storage_info()->LevelFiles(output_level);
+ ASSERT_EQ(output_files.size(), 1);
+ ASSERT_EQ(output_files[0]->oldest_blob_file_number,
+ expected_oldest_blob_file_number);
+ }
+ }
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ EnvOptions env_options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ MutableCFOptions mutable_cf_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ SequenceNumber preserve_deletes_seqnum_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+ CompactionJobStats compaction_job_stats_;
+ ColumnFamilyData* cfd_;
+ std::unique_ptr<CompactionFilter> compaction_filter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ ErrorHandler error_handler_;
+};
+
+TEST_F(CompactionJobTest, Simple) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto files = cfd->current()->storage_info()->LevelFiles(0);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({ files }, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleCorrupted) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(true);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto files = cfd->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
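+  // CreateTwoFiles(true) corrupts kCorruptKeysPerFile (200) keys per file,
+  // so the two input files contribute 400 corrupt keys in total.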
+ ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+ {KeyStr("b", 1U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, OutputNothing) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}});
+
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile();
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleOverwrite) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 3U, kTypeValue), "val2"},
+ {KeyStr("b", 4U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"},
+ {KeyStr("b", 0U, kTypeValue), "val3"}});
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleNonLastLevel) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ // Because level 1 is not the last level, the sequence numbers of a and b
+ // cannot be set to 0
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+ RunCompaction({lvl0_files, lvl1_files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeValue), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, NonAssocMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeMerge), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered
+ });
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+ {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+ {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+ {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+ // b does not appear because the operands are filtered
+ });
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file3, 2);
+
+ SetLastSequence(11U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+
+ stl_wrappers::KVMap empty_map;
+ RunCompaction({files}, empty_map);
+}
+
+TEST_F(CompactionJobTest, SimpleSingleDelete) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeDeletion), ""},
+ {KeyStr("b", 6U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("a", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteSnapshots) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("d", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 2U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("0", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), "val1"},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 21U, kTypeValue), "val3"},
+ {KeyStr("d", 8U, kTypeValue), "val4"},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("h", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("A", 1U, kTypeValue), "val"},
+ {KeyStr("e", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 21U, kTypeValue), ""},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U, 20U}, 10U);
+}
+
+TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) {
+ NewDB();
+
+ // Test multiple snapshots where the earliest snapshot is not a
+  // write-conflict snapshot.
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), "val"},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), "val"},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), "val"},
+ {KeyStr("G", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 23U, kTypeValue), "val2"},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("H", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 23U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), "val2"},
+ {KeyStr("I", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 32U, kTypeValue), "val3"},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 13U, kTypeValue), "val"},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val3"},
+ {KeyStr("H", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 13U, kTypeValue), "val2"},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val5"},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), ""},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), ""},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), ""},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), ""},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+
+ SetLastSequence(24U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U, 20U, 30U}, 20U);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("dummy", 5U, kTypeValue), "val2"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 0U, kTypeValue), "val"},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("dummy", 0U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+  // Tests several scenarios involving multiple single delete/put pairs:
+ //
+ // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+ // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot
+ // C: SDel Put SDel Snapshot Put -> Snapshot Put
+ // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel
+ // E: Put SDel Snapshot Put SDel -> Snapshot SDel
+  // F: Put SDel Put SDel Snapshot -> removed
+ // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel
+  // H: (Put) Put SDel Put SDel Snapshot -> removed
+ // I: (Put) Snapshot Put SDel Put SDel -> SDel
+ // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put
+ // -> Snapshot Put
+ // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot Put Snapshot SDel
+ // L: SDel Put Del Put SDel Snapshot Del Put Del SDel Put SDel
+ // -> Snapshot SDel
+ // M: (Put) SDel Put Del Put SDel Snapshot Put Del SDel Put SDel Del
+ // -> SDel Snapshot Del
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val5"},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), "val4"},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val"},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val"},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 12U, kTypeValue), "val"},
+ {KeyStr("J", 11U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), "val1"},
+ {KeyStr("K", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 12U, kTypeValue), "val2"},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), "val"},
+ {KeyStr("L", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 13U, kTypeDeletion), ""},
+ {KeyStr("L", 12U, kTypeValue), "val"},
+ {KeyStr("L", 11U, kTypeDeletion), ""},
+ {KeyStr("M", 16U, kTypeDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), "val"},
+ {KeyStr("M", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 12U, kTypeDeletion), ""},
+ {KeyStr("M", 11U, kTypeValue), "val"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 11U, kTypeValue), "val2"},
+ {KeyStr("C", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 9U, kTypeValue), "val6"},
+ {KeyStr("C", 8U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), "val"},
+ {KeyStr("E", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 4U, kTypeValue), "val"},
+ {KeyStr("F", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 5U, kTypeValue), "val"},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 5U, kTypeValue), "val"},
+ {KeyStr("H", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 3U, kTypeValue), "val"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val"},
+ {KeyStr("J", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 4U, kTypeValue), "val"},
+ {KeyStr("J", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 2U, kTypeValue), "val"},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("K", 7U, kTypeValue), "val4"},
+ {KeyStr("K", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 5U, kTypeValue), "val5"},
+ {KeyStr("K", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 4U, kTypeValue), "val"},
+ {KeyStr("L", 3U, kTypeDeletion), ""},
+ {KeyStr("L", 2U, kTypeValue), "val"},
+ {KeyStr("L", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 7U, kTypeValue), "val"},
+ {KeyStr("M", 5U, kTypeDeletion), ""},
+ {KeyStr("M", 4U, kTypeValue), "val"},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("D", 1U, kTypeValue), "val"},
+ {KeyStr("H", 1U, kTypeValue), "val"},
+ {KeyStr("I", 2U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({
+ {KeyStr("M", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file4, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), ""},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), ""},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), ""},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), ""},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), ""},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), ""},
+ {KeyStr("M", 16U, kTypeDeletion), ""},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""}});
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U}, 10U);
+}
+
+// This test documents the behavior where a corrupt key follows a deletion or a
+// single deletion and the (single) deletion gets removed while the corrupt key
+// gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, CorruptionAfterDeletion) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+ {test::KeyStr("a", 5U, kTypeDeletion), ""},
+ {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+ {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+ {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 0U, kTypeValue), "val2"}});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, OldestBlobFileNumber) {
+ NewDB();
+
+ // Note: blob1 is inlined TTL, so it will not be considered for the purposes
+ // of identifying the oldest referenced blob file. Similarly, blob6 will be
+ // ignored because it has TTL and hence refers to a TTL blob file.
+ const stl_wrappers::KVMap::value_type blob1(
+ KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL));
+ const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex),
+ BlobStr(59, 123456, 999));
+ const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex),
+ BlobStr(138, 1000, 1 << 8));
+ auto file1 = mock::MakeMockFile({blob1, blob2, blob3});
+ AddMockFile(file1);
+
+ const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex),
+ BlobStr(199, 3 << 10, 1 << 20));
+ const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex),
+ BlobStr(19, 6789, 333));
+ const stl_wrappers::KVMap::value_type blob6(
+ KeyStr("f", 6U, kTypeBlobIndex),
+ BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL));
+ auto file2 = mock::MakeMockFile({blob4, blob5, blob6});
+ AddMockFile(file2);
+
+ const stl_wrappers::KVMap::value_type expected_blob1(
+ KeyStr("a", 0U, kTypeBlobIndex), blob1.second);
+ const stl_wrappers::KVMap::value_type expected_blob2(
+ KeyStr("b", 0U, kTypeBlobIndex), blob2.second);
+ const stl_wrappers::KVMap::value_type expected_blob3(
+ KeyStr("c", 0U, kTypeBlobIndex), blob3.second);
+ const stl_wrappers::KVMap::value_type expected_blob4(
+ KeyStr("d", 0U, kTypeBlobIndex), blob4.second);
+ const stl_wrappers::KVMap::value_type expected_blob5(
+ KeyStr("e", 0U, kTypeBlobIndex), blob5.second);
+ const stl_wrappers::KVMap::value_type expected_blob6(
+ KeyStr("f", 0U, kTypeBlobIndex), blob6.second);
+ auto expected_results =
+ mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3,
+ expected_blob4, expected_blob5, expected_blob6});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
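+  // Among the remaining blob references, blob2, blob3, blob4 and blob5 point
+  // to blob files 59, 138, 199 and 19 respectively, so the oldest referenced
+  // blob file is number 19.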
+ RunCompaction({files}, expected_results, std::vector<SequenceNumber>(),
+ kMaxSequenceNumber, /* output_level */ 1, /* verify */ true,
+ /* expected_oldest_blob_file_number */ 19);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc
new file mode 100644
index 000000000..4355d4b91
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.cc
@@ -0,0 +1,1131 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->compensated_file_size;
+ }
+ return sum;
+}
+} // anonymous namespace
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+ size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file,
+ uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno) {
+  // Do not pick an ingested file when there is at least one unflushed
+  // memtable whose seqno range overlaps with the SST.
+ TEST_SYNC_POINT("FindIntraL0Compaction");
+ size_t start = 0;
+ for (; start < level_files.size(); start++) {
+ if (level_files[start]->being_compacted) {
+ return false;
+ }
+    // If there is no data in the memtable, the earliest sequence number
+    // would be the largest sequence number in the last memtable.
+    // Because all files are sorted in descending order by largest_seqno, we
+    // only need to check the first one.
+ if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+ break;
+ }
+ }
+ if (start >= level_files.size()) {
+ return false;
+ }
+ size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+ uint64_t compensated_compact_bytes =
+ level_files[start]->compensated_file_size;
+ size_t compact_bytes_per_del_file = port::kMaxSizet;
+ // Compaction range will be [start, limit).
+ size_t limit;
+ // Pull in files until the amount of compaction work per deleted file begins
+ // increasing or maximum total compaction size is reached.
+ size_t new_compact_bytes_per_del_file = 0;
+ for (limit = start + 1; limit < level_files.size(); ++limit) {
+ compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+ compensated_compact_bytes += level_files[limit]->compensated_file_size;
+ new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+ if (level_files[limit]->being_compacted ||
+ new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+ compensated_compact_bytes > max_compaction_bytes) {
+ break;
+ }
+ compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+ }
+
+ if ((limit - start) >= min_files_to_compact &&
+ compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+ assert(comp_inputs != nullptr);
+ comp_inputs->level = 0;
+ for (size_t i = start; i < limit; ++i) {
+ comp_inputs->files.push_back(level_files[i]);
+ }
+ return true;
+ }
+ return false;
+}
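+
+// A minimal usage sketch for the helper above. The thresholds are purely
+// illustrative (not taken from any option defaults), and it assumes an L0
+// VersionStorageInfo* named vstorage is in scope.
+//
+//   CompactionInputFiles intra_l0_inputs;
+//   if (FindIntraL0Compaction(vstorage->LevelFiles(0),
+//                             /*min_files_to_compact=*/4,
+//                             /*max_compact_bytes_per_del_file=*/64u << 20,
+//                             /*max_compaction_bytes=*/256u << 20,
+//                             &intra_l0_inputs,
+//                             /*earliest_mem_seqno=*/kMaxSequenceNumber)) {
+//     // intra_l0_inputs.files now holds the chosen run of L0 files.
+//   }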
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ // disable compression
+ return kNoCompression;
+ }
+
+ // If bottommost_compression is set and we are compacting to the
+ // bottommost level then we should use it.
+ if (ioptions.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1)) {
+ return ioptions.bottommost_compression;
+ }
+ // If the user has specified a different compression level for each level,
+ // then pick the compression for that level.
+ if (!ioptions.compression_per_level.empty()) {
+ assert(level == 0 || level >= base_level);
+ int idx = (level == 0) ? 0 : level - base_level + 1;
+
+ const int n = static_cast<int>(ioptions.compression_per_level.size()) - 1;
+ // It is possible for level_ to be -1; in that case, we use level
+ // 0's compression. This occurs mostly in backwards compatibility
+ // situations when the builder doesn't know what level the file
+ // belongs to. Likewise, if level is beyond the end of the
+ // specified compression levels, use the last value.
+ return ioptions.compression_per_level[std::max(0, std::min(idx, n))];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
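+
+// Illustration of the per-level index mapping above (assumed configuration,
+// for exposition only): with base_level == 2 and compression_per_level ==
+// {kNoCompression, kSnappyCompression, kZSTD}, level 0 maps to entry 0, the
+// base level (level 2) maps to entry 1, level 3 maps to entry 2, and any
+// deeper level is clamped to the last entry.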
+
+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ return ioptions.compression_opts;
+ }
+  // If bottommost_compression is set and we are compacting to the
+  // bottommost level, then we should use the specified compression options
+  // for the bottommost_compression.
+ if (ioptions.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1) &&
+ ioptions.bottommost_compression_opts.enabled) {
+ return ioptions.bottommost_compression_opts;
+ }
+ return ioptions.compression_opts;
+}
+
+CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : ioptions_(ioptions), icmp_(icmp) {}
+
+CompactionPicker::~CompactionPicker() {}
+
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+ UnregisterCompaction(c);
+ if (!status.ok()) {
+ c->ResetNextCompactionIndex();
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ const int level = inputs.level;
+ assert(!inputs.empty());
+ smallest->Clear();
+ largest->Clear();
+
+ if (level == 0) {
+ for (size_t i = 0; i < inputs.size(); i++) {
+ FileMetaData* f = inputs[i];
+ if (i == 0) {
+ *smallest = f->smallest;
+ *largest = f->largest;
+ } else {
+ if (icmp_->Compare(f->smallest, *smallest) < 0) {
+ *smallest = f->smallest;
+ }
+ if (icmp_->Compare(f->largest, *largest) > 0) {
+ *largest = f->largest;
+ }
+ }
+ }
+ } else {
+ *smallest = inputs[0]->smallest;
+ *largest = inputs[inputs.size() - 1]->largest;
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ assert(!inputs1.empty() || !inputs2.empty());
+ if (inputs1.empty()) {
+ GetRange(inputs2, smallest, largest);
+ } else if (inputs2.empty()) {
+ GetRange(inputs1, smallest, largest);
+ } else {
+ InternalKey smallest1, smallest2, largest1, largest2;
+ GetRange(inputs1, &smallest1, &largest1);
+ GetRange(inputs2, &smallest2, &largest2);
+ *smallest =
+ icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2;
+ *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1;
+ }
+}
+
+void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ InternalKey current_smallest;
+ InternalKey current_largest;
+ bool initialized = false;
+ for (const auto& in : inputs) {
+ if (in.empty()) {
+ continue;
+ }
+ GetRange(in, &current_smallest, &current_largest);
+ if (!initialized) {
+ *smallest = current_smallest;
+ *largest = current_largest;
+ initialized = true;
+ } else {
+ if (icmp_->Compare(current_smallest, *smallest) < 0) {
+ *smallest = current_smallest;
+ }
+ if (icmp_->Compare(current_largest, *largest) > 0) {
+ *largest = current_largest;
+ }
+ }
+ }
+ assert(initialized);
+}
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest) {
+  // This isn't a good compaction
+ assert(!inputs->empty());
+
+ const int level = inputs->level;
+ // GetOverlappingInputs will always do the right thing for level-0.
+ // So we don't need to do any expansion if level == 0.
+ if (level == 0) {
+ return true;
+ }
+
+ InternalKey smallest, largest;
+
+ // Keep expanding inputs until we are sure that there is a "clean cut"
+ // boundary between the files in input and the surrounding files.
+ // This will ensure that no parts of a key are lost during compaction.
+ int hint_index = -1;
+ size_t old_size;
+ do {
+ old_size = inputs->size();
+ GetRange(*inputs, &smallest, &largest);
+ inputs->clear();
+ vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+ hint_index, &hint_index, true,
+ next_smallest);
+ } while (inputs->size() > old_size);
+
+  // We started off with non-empty inputs and the previous loop only grew
+  // them, so inputs should still be non-empty here.
+ assert(!inputs->empty());
+
+ // If, after the expansion, there are files that are already under
+ // compaction, then we must drop/cancel this compaction.
+ if (AreFilesInCompaction(inputs->files)) {
+ return false;
+ }
+ return true;
+}
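+
+// Illustrative example of the "clean cut" requirement above (hypothetical
+// files): if entries for user key "k" are split across two adjacent files at
+// this level, compacting one of those files without the other could lose
+// parts of that key, so the loop keeps widening the input range until no
+// user key straddles the boundary of the picked files.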
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ const Comparator* ucmp = icmp_->user_comparator();
+ for (Compaction* c : compactions_in_progress_) {
+ if (c->output_level() == level &&
+ ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) <= 0 &&
+ ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) >= 0) {
+ // Overlap
+ return true;
+ }
+ }
+ // Did not overlap with any running compaction in level `level`
+ return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level) const {
+ bool is_empty = true;
+ for (auto& in : inputs) {
+ if (!in.empty()) {
+ is_empty = false;
+ break;
+ }
+ }
+ if (is_empty) {
+ // No files in inputs
+ return false;
+ }
+
+ InternalKey smallest, largest;
+ GetRange(inputs, &smallest, &largest);
+ return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ level);
+}
+
+// Returns true if any one of the specified files is being compacted
+bool CompactionPicker::AreFilesInCompaction(
+ const std::vector<FileMetaData*>& files) {
+ for (size_t i = 0; i < files.size(); i++) {
+ if (files[i]->being_compacted) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files, int output_level,
+ VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+ uint32_t output_path_id) {
+ assert(input_files.size());
+ // This compaction output should not overlap with a running compaction as
+ // `SanitizeCompactionInputFiles` should've checked earlier and db mutex
+ // shouldn't have been released since.
+ assert(!FilesRangeOverlapWithCompaction(input_files, output_level));
+
+ CompressionType compression_type;
+ if (compact_options.compression == kDisableCompressionOption) {
+ int base_level;
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ base_level = vstorage->base_level();
+ } else {
+ base_level = 1;
+ }
+ compression_type =
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+ output_level, base_level);
+ } else {
+ // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+ // without configurable `CompressionOptions`, which is inconsistent.
+ compression_type = compact_options.compression;
+ }
+ auto c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, input_files, output_level,
+ compact_options.output_file_size_limit,
+ mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_options.max_subcompactions,
+ /* grandparents */ {}, true);
+ RegisterCompaction(c);
+ return c;
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage,
+ const CompactionOptions& /*compact_options*/) const {
+ if (input_set->size() == 0U) {
+ return Status::InvalidArgument(
+ "Compaction must include at least one file.");
+ }
+ assert(input_files);
+
+ std::vector<CompactionInputFiles> matched_input_files;
+ matched_input_files.resize(vstorage->num_levels());
+ int first_non_empty_level = -1;
+ int last_non_empty_level = -1;
+ // TODO(yhchiang): use a lazy-initialized mapping from
+ // file_number to FileMetaData in Version.
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ for (auto file : vstorage->LevelFiles(level)) {
+ auto iter = input_set->find(file->fd.GetNumber());
+ if (iter != input_set->end()) {
+ matched_input_files[level].files.push_back(file);
+ input_set->erase(iter);
+ last_non_empty_level = level;
+ if (first_non_empty_level == -1) {
+ first_non_empty_level = level;
+ }
+ }
+ }
+ }
+
+ if (!input_set->empty()) {
+ std::string message(
+ "Cannot find matched SST files for the following file numbers:");
+ for (auto fn : *input_set) {
+ message += " ";
+ message += ToString(fn);
+ }
+ return Status::InvalidArgument(message);
+ }
+
+ for (int level = first_non_empty_level; level <= last_non_empty_level;
+ ++level) {
+ matched_input_files[level].level = level;
+ input_files->emplace_back(std::move(matched_input_files[level]));
+ }
+
+ return Status::OK();
+}
+
+// Returns true if any one of the parent files is being compacted
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest,
+ int level, int* level_index) {
+ std::vector<FileMetaData*> inputs;
+ assert(level < NumberLevels());
+
+ vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+ level_index ? *level_index : 0, level_index);
+ return AreFilesInCompaction(inputs);
+}
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// For now, we assume all levels except the start and output levels are empty.
+// Will also attempt to expand "start level" if that doesn't expand
+// "output level" or cause "level" to include a file for compaction that has an
+// overlapping user-key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on the parent level are currently being compacted,
+// which means that we can't compact them.
+bool CompactionPicker::SetupOtherInputs(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs, int* parent_index,
+ int base_index) {
+ assert(!inputs->empty());
+ assert(output_level_inputs->empty());
+ const int input_level = inputs->level;
+ const int output_level = output_level_inputs->level;
+ if (input_level == output_level) {
+ // no possibility of conflict
+ return true;
+ }
+
+ // For now, we only support merging two levels, start level and output level.
+ // We need to assert other levels are empty.
+ for (int l = input_level + 1; l < output_level; l++) {
+ assert(vstorage->NumLevelFiles(l) == 0);
+ }
+
+ InternalKey smallest, largest;
+
+ // Get the range one last time.
+ GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (output_level_inputs) to include
+  // in the compaction.
+ vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+ &output_level_inputs->files, *parent_index,
+ parent_index);
+ if (AreFilesInCompaction(output_level_inputs->files)) {
+ return false;
+ }
+ if (!output_level_inputs->empty()) {
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+ return false;
+ }
+ }
+
+ // See if we can further grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up. We also choose NOT
+ // to expand if this would cause "level" to include some entries for some
+ // user key, while excluding other entries for the same user key. This
+ // can happen when one user key spans multiple files.
+ if (!output_level_inputs->empty()) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ const uint64_t output_level_inputs_size =
+ TotalCompensatedFileSize(output_level_inputs->files);
+ const uint64_t inputs_size = TotalCompensatedFileSize(inputs->files);
+ bool expand_inputs = false;
+
+ CompactionInputFiles expanded_inputs;
+ expanded_inputs.level = input_level;
+ // Get closed interval of output level
+ InternalKey all_start, all_limit;
+ GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+ bool try_overlapping_inputs = true;
+ vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+ &expanded_inputs.files, base_index, nullptr);
+ uint64_t expanded_inputs_size =
+ TotalCompensatedFileSize(expanded_inputs.files);
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+ try_overlapping_inputs = false;
+ }
+ if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+ output_level_inputs_size + expanded_inputs_size < limit &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded_inputs, &new_start, &new_limit);
+ CompactionInputFiles expanded_output_level_inputs;
+ expanded_output_level_inputs.level = output_level;
+ vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+ &expanded_output_level_inputs.files,
+ *parent_index, parent_index);
+ assert(!expanded_output_level_inputs.empty());
+ if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+ ExpandInputsToCleanCut(cf_name, vstorage,
+ &expanded_output_level_inputs) &&
+ expanded_output_level_inputs.size() == output_level_inputs->size()) {
+ expand_inputs = true;
+ }
+ }
+ if (!expand_inputs) {
+ vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+ &all_limit, &expanded_inputs.files,
+ base_index, nullptr);
+ expanded_inputs_size = TotalCompensatedFileSize(expanded_inputs.files);
+ if (expanded_inputs.size() > inputs->size() &&
+ output_level_inputs_size + expanded_inputs_size < limit &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ expand_inputs = true;
+ }
+ }
+ if (expand_inputs) {
+ ROCKS_LOG_INFO(ioptions_.info_log,
+ "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+ "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+ "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+ cf_name.c_str(), input_level, inputs->size(),
+ output_level_inputs->size(), inputs_size,
+ output_level_inputs_size, expanded_inputs.size(),
+ output_level_inputs->size(), expanded_inputs_size,
+ output_level_inputs_size);
+ inputs->files = expanded_inputs.files;
+ }
+ }
+ return true;
+}
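+
+// Sketch of the expansion rule above with hypothetical key ranges: if the
+// start-level inputs cover [b..d] and the chosen output-level inputs cover
+// [a..f], another start-level file covering [e..f] may be pulled in, because
+// re-querying the output level for the widened range still yields the same
+// output-level files; a file that would drag in extra output-level files or
+// push the total past max_compaction_bytes is not added.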
+
+void CompactionPicker::GetGrandparents(
+ VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents) {
+ InternalKey start, limit;
+ GetRange(inputs, output_level_inputs, &start, &limit);
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2)
+ if (output_level_inputs.level + 1 < NumberLevels()) {
+ vstorage->GetOverlappingInputs(output_level_inputs.level + 1, &start,
+ &limit, grandparents);
+ }
+}
+
+Compaction* CompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const InternalKey* begin,
+ const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore) {
+ // CompactionPickerFIFO has its own implementation of compact range
+ assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+ if (input_level == ColumnFamilyData::kCompactAllLevels) {
+ assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+ // Universal compaction with more than one level always compacts all the
+ // files together to the last level.
+ assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() sets the output level to be the last level
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level == vstorage->num_levels() - 2);
+ } else {
+ assert(output_level == vstorage->num_levels() - 1);
+ }
+    // DBImpl::RunManualCompaction will make the full key range for universal
+    // compaction
+ assert(begin == nullptr);
+ assert(end == nullptr);
+ *compaction_end = nullptr;
+
+ int start_level = 0;
+ for (; start_level < vstorage->num_levels() &&
+ vstorage->NumLevelFiles(start_level) == 0;
+ start_level++) {
+ }
+ if (start_level == vstorage->num_levels()) {
+ return nullptr;
+ }
+
+ if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ *manual_conflict = true;
+ // Only one level 0 compaction allowed
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+ start_level);
+ for (int level = start_level; level < vstorage->num_levels(); level++) {
+ inputs[level - start_level].level = level;
+ auto& files = inputs[level - start_level].files;
+ for (FileMetaData* f : vstorage->LevelFiles(level)) {
+ files.push_back(f);
+ }
+ if (AreFilesInCompaction(files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+    // Two non-exclusive manual compactions could run at the same time,
+    // producing overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(inputs, output_level)) {
+      // This compaction output could potentially conflict with the output
+      // of a currently running compaction, so we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style),
+ /* max_compaction_bytes */ LLONG_MAX,
+ compact_range_options.target_path_id,
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+ output_level, 1),
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_range_options.max_subcompactions, /* grandparents */ {},
+ /* is manual */ true);
+ RegisterCompaction(c);
+ return c;
+ }
+
+ CompactionInputFiles inputs;
+ inputs.level = input_level;
+ bool covering_the_whole_range = true;
+
+ // All files are 'overlapping' in universal style compaction.
+ // We have to compact the entire range in one shot.
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ begin = nullptr;
+ end = nullptr;
+ }
+
+ vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+ if (inputs.empty()) {
+ return nullptr;
+ }
+
+ if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ // Only one level 0 compaction allowed
+ TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ // Avoid compacting too much in one shot in case the range is large.
+ // But we cannot do this for level-0 since level-0 files can overlap
+ // and we must not pick one file and drop another older file if the
+ // two files overlap.
+ if (input_level > 0) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ uint64_t total = 0;
+ for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+ uint64_t s = inputs[i]->compensated_file_size;
+ total += s;
+ if (total >= limit) {
+ covering_the_whole_range = false;
+ inputs.files.resize(i + 1);
+ break;
+ }
+ }
+ }
+ assert(compact_range_options.target_path_id <
+ static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+  // For bottommost-level compaction only, use max_file_num_to_ignore to
+  // filter out files that are created during the current compaction.
+ if (compact_range_options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForceOptimized &&
+ max_file_num_to_ignore != port::kMaxUint64) {
+ assert(input_level == output_level);
+    // inputs_shrunk holds a contiguous subset of input files which were all
+    // created before the current manual compaction
+ std::vector<FileMetaData*> inputs_shrunk;
+ size_t skip_input_index = inputs.size();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ inputs_shrunk.push_back(inputs[i]);
+ } else if (!inputs_shrunk.empty()) {
+        // inputs[i] was created during the current manual compaction and
+        // needs to be skipped
+ skip_input_index = i;
+ break;
+ }
+ }
+ if (inputs_shrunk.empty()) {
+ return nullptr;
+ }
+ if (inputs.size() != inputs_shrunk.size()) {
+ inputs.files.swap(inputs_shrunk);
+ }
+    // Set covering_the_whole_range to false if there is any file that needs
+    // to be compacted in the range of inputs[skip_input_index+1, inputs.size())
+ for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ covering_the_whole_range = false;
+ }
+ }
+ }
+
+ InternalKey key_storage;
+ InternalKey* next_smallest = &key_storage;
+ if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+ false) {
+    // Manual compaction is now multi-threaded, so it can happen that
+    // ExpandInputsToCleanCut fails; we handle it higher up in
+    // RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ if (covering_the_whole_range || !next_smallest) {
+ *compaction_end = nullptr;
+ } else {
+ **compaction_end = *next_smallest;
+ }
+
+ CompactionInputFiles output_level_inputs;
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ assert(input_level == 0);
+ output_level = vstorage->base_level();
+ assert(output_level > 0);
+ }
+ output_level_inputs.level = output_level;
+ if (input_level != output_level) {
+ int parent_index = -1;
+ if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+ &output_level_inputs, &parent_index, -1)) {
+      // Manual compaction is now multi-threaded, so it can happen that
+      // SetupOtherInputs fails; we handle it higher up in
+      // RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+ std::vector<CompactionInputFiles> compaction_inputs({inputs});
+ if (!output_level_inputs.empty()) {
+ compaction_inputs.push_back(output_level_inputs);
+ }
+ for (size_t i = 0; i < compaction_inputs.size(); i++) {
+ if (AreFilesInCompaction(compaction_inputs[i].files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+  // Two non-exclusive manual compactions could run at the same time,
+  // producing overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(compaction_inputs, output_level)) {
+    // This compaction output could potentially conflict with the output
+    // of a currently running compaction, so we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+ Compaction* compaction = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(compaction_inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style, vstorage->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options.max_compaction_bytes,
+ compact_range_options.target_path_id,
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level,
+ vstorage->base_level()),
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_range_options.max_subcompactions, std::move(grandparents),
+ /* is manual compaction */ true);
+
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+ RegisterCompaction(compaction);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+ return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+ const SstFileMetaData& b) {
+ if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+ if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+} // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ auto& levels = cf_meta.levels;
+ auto comparator = icmp_->user_comparator();
+
+ // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+ // the smallest and largest key of the current compaction input
+ std::string smallestkey;
+ std::string largestkey;
+ // a flag for initializing smallest and largest key
+ bool is_first = false;
+ const int kNotFound = -1;
+
+ // For each level, it does the following things:
+ // 1. Find the first and the last compaction input files
+ // in the current level.
+ // 2. Include all files between the first and the last
+ // compaction input files.
+ // 3. Update the compaction key-range.
+ // 4. For all remaining levels, include files that have
+ // overlapping key-range with the compaction key-range.
+ for (int l = 0; l <= output_level; ++l) {
+ auto& current_files = levels[l].files;
+ int first_included = static_cast<int>(current_files.size());
+ int last_included = kNotFound;
+
+ // identify the first and the last compaction input files
+ // in the current level.
+ for (size_t f = 0; f < current_files.size(); ++f) {
+ if (input_files->find(TableFileNameToNumber(current_files[f].name)) !=
+ input_files->end()) {
+ first_included = std::min(first_included, static_cast<int>(f));
+ last_included = std::max(last_included, static_cast<int>(f));
+ if (is_first == false) {
+ smallestkey = current_files[f].smallestkey;
+ largestkey = current_files[f].largestkey;
+ is_first = true;
+ }
+ }
+ }
+ if (last_included == kNotFound) {
+ continue;
+ }
+
+ if (l != 0) {
+ // expand the compaction input of the current level if it
+ // has an overlapping key-range with other non-compaction input
+ // files in the same level.
+ while (first_included > 0) {
+ if (comparator->Compare(current_files[first_included - 1].largestkey,
+ current_files[first_included].smallestkey) <
+ 0) {
+ break;
+ }
+ first_included--;
+ }
+
+ while (last_included < static_cast<int>(current_files.size()) - 1) {
+ if (comparator->Compare(current_files[last_included + 1].smallestkey,
+ current_files[last_included].largestkey) > 0) {
+ break;
+ }
+ last_included++;
+ }
+ } else if (output_level > 0) {
+ last_included = static_cast<int>(current_files.size() - 1);
+ }
+
+ // include all files between the first and the last compaction input files.
+ for (int f = first_included; f <= last_included; ++f) {
+ if (current_files[f].being_compacted) {
+ return Status::Aborted("Necessary compaction input file " +
+ current_files[f].name +
+ " is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(current_files[f].name));
+ }
+
+ // update smallest and largest key
+ if (l == 0) {
+ for (int f = first_included; f <= last_included; ++f) {
+ if (comparator->Compare(smallestkey, current_files[f].smallestkey) >
+ 0) {
+ smallestkey = current_files[f].smallestkey;
+ }
+ if (comparator->Compare(largestkey, current_files[f].largestkey) < 0) {
+ largestkey = current_files[f].largestkey;
+ }
+ }
+ } else {
+ if (comparator->Compare(smallestkey,
+ current_files[first_included].smallestkey) > 0) {
+ smallestkey = current_files[first_included].smallestkey;
+ }
+ if (comparator->Compare(largestkey,
+ current_files[last_included].largestkey) < 0) {
+ largestkey = current_files[last_included].largestkey;
+ }
+ }
+
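+ // Wrap the accumulated key range in a dummy SstFileMetaData so the
+ // HaveOverlappingKeyRanges() helper above can be reused to test files in
+ // the lower levels against it.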
+ SstFileMetaData aggregated_file_meta;
+ aggregated_file_meta.smallestkey = smallestkey;
+ aggregated_file_meta.largestkey = largestkey;
+
+ // For all lower levels, include all overlapping files.
+ // We need to add overlapping files from the current level too because even
+ // if there are no input_files in level l, we would still need to add files
+ // which overlap with the range containing the input_files in levels 0 to l.
+ // Level 0 doesn't need to be handled this way because its files are sorted
+ // by time and not by key.
+ for (int m = std::max(l, 1); m <= output_level; ++m) {
+ for (auto& next_lv_file : levels[m].files) {
+ if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+ next_lv_file)) {
+ if (next_lv_file.being_compacted) {
+ return Status::Aborted(
+ "File " + next_lv_file.name +
+ " that has an overlapping key range with one of the compaction "
+ "input files is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(next_lv_file.name));
+ }
+ }
+ }
+ }
+ if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+ return Status::Aborted(
+ "A running compaction is writing to the same output level in an "
+ "overlapping key range");
+ }
+ return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+ cf_meta.levels[cf_meta.levels.size() - 1].level);
+ if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+ return Status::InvalidArgument(
+ "Output level for column family " + cf_meta.name +
+ " must be between [0, " +
+ ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+ }
+
+ if (output_level > MaxOutputLevel()) {
+ return Status::InvalidArgument(
+ "Exceed the maximum output level defined by "
+ "the current compaction algorithm --- " +
+ ToString(MaxOutputLevel()));
+ }
+
+ if (output_level < 0) {
+ return Status::InvalidArgument("Output level cannot be negative.");
+ }
+
+ if (input_files->size() == 0) {
+ return Status::InvalidArgument(
+ "A compaction must contain at least one file.");
+ }
+
+ Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+ output_level);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // for all input files, check whether the file number matches
+ // any currently-existing files.
+ for (auto file_num : *input_files) {
+ bool found = false;
+ for (const auto& level_meta : cf_meta.levels) {
+ for (const auto& file_meta : level_meta.files) {
+ if (file_num == TableFileNameToNumber(file_meta.name)) {
+ if (file_meta.being_compacted) {
+ return Status::Aborted("Specified compaction input file " +
+ MakeTableFileName("", file_num) +
+ " is already being compacted.");
+ }
+ found = true;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (!found) {
+ return Status::InvalidArgument(
+ "Specified compaction input file " + MakeTableFileName("", file_num) +
+ " does not exist in column family " + cf_meta.name + ".");
+ }
+ }
+
+ return Status::OK();
+}
+#endif // !ROCKSDB_LITE
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
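+ // For leveled compaction, a compaction writing to a non-L0 output level
+ // must not overlap the key range of any other registered compaction that
+ // writes to the same level.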
+ assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+ c->output_level() == 0 ||
+ !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level()));
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.insert(c);
+ }
+ compactions_in_progress_.insert(c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.erase(c);
+ }
+ compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+ const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
+ int* output_level, CompactionInputFiles* start_level_inputs) {
+ if (vstorage->FilesMarkedForCompaction().empty()) {
+ return;
+ }
+
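+ // The continuation lambda tries to turn a single marked file into a clean,
+ // conflict-free set of inputs; it is tried on one randomly chosen marked
+ // file first and then, if that fails, on every marked file in order.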
+ auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+ // If it's being compacted it has nothing to do here.
+ // If this assert() fails that means that some function marked some
+ // files as being_compacted, but didn't call ComputeCompactionScore()
+ assert(!level_file.second->being_compacted);
+ *start_level = level_file.first;
+ *output_level =
+ (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+ if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs->files = {level_file.second};
+ start_level_inputs->level = *start_level;
+ return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+ };
+
+ // take a chance on a random file first
+ Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+ size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+ static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+
+ if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+ // found the compaction!
+ return;
+ }
+
+ for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+ start_level_inputs->files.clear();
+}
+
+bool CompactionPicker::GetOverlappingL0Files(
+ VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index) {
+ // Two level 0 compactions won't run at the same time, so we don't need to
+ // worry about files on level 0 being compacted.
+ assert(level0_compactions_in_progress()->empty());
+ InternalKey smallest, largest;
+ GetRange(*start_level_inputs, &smallest, &largest);
+ // Note that the next call will discard the file we placed in
+ // c->inputs_[0] earlier and replace it with an overlapping set
+ // which will include the picked file.
+ start_level_inputs->files.clear();
+ vstorage->GetOverlappingInputs(0, &smallest, &largest,
+ &(start_level_inputs->files));
+
+ // If we include more L0 files in the same compaction run it can
+ // cause the 'smallest' and 'largest' key to get extended to a
+ // larger range. So, re-invoke GetRange to get the new key range
+ GetRange(*start_level_inputs, &smallest, &largest);
+ if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+ parent_index)) {
+ return false;
+ }
+ assert(!start_level_inputs->files.empty());
+
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..36d570e68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains the abstract class CompactionPicker, its two
+// sub-classes LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits from the class and implements the
+// interface to form automatic compactions. If NeedCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
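+//
+// A minimal usage sketch (illustrative only; the variable names are
+// placeholders and the caller is assumed to hold the DB mutex, as required
+// by ReleaseCompactionFiles()):
+//
+//   LevelCompactionPicker picker(ioptions, &internal_comparator);
+//   if (picker.NeedsCompaction(vstorage)) {
+//     Compaction* c = picker.PickCompaction(cf_name, mutable_cf_options,
+//                                           vstorage, log_buffer);
+//     if (c != nullptr) {
+//       // ... run the compaction job, then ...
+//       picker.ReleaseCompactionFiles(c, Status::OK());
+//       delete c;  // caller owns the returned Compaction object
+//     }
+//   }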
+class CompactionPicker {
+ public:
+ CompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp);
+ virtual ~CompactionPicker();
+
+ // Pick level and inputs for a new compaction.
+ // Returns nullptr if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns nullptr if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ //
+ // The returned Compaction might not include the whole requested range.
+ // In that case, compaction_end will be set to the next key that needs
+ // compacting. In case the compaction will compact the whole range,
+ // compaction_end will be set to nullptr.
+ // The client is responsible for compaction_end storage -- when called,
+ // *compaction_end should point to a valid InternalKey!
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore);
+
+ // The maximum allowed output level. Default value is NumberLevels() - 1.
+ virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files. If it's not possible to convert an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with a specific reason.
+#ifndef ROCKSDB_LITE
+ Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta,
+ const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Free up the files that participated in a compaction
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Compaction* c, Status status);
+
+ // Returns true if any one of the specified files is being compacted
+ bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+ // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+ // object.
+ //
+ // Caller must provide a set of input files that has been passed through
+ // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+ // between that call and this one.
+ Compaction* CompactFiles(const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files,
+ int output_level, VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ uint32_t output_path_id);
+
+ // Converts a set of compaction input file numbers into
+ // a list of CompactionInputFiles.
+ Status GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set,
+ const VersionStorageInfo* vstorage,
+ const CompactionOptions& compact_options) const;
+
+ // Is there currently a compaction involving level 0 taking place
+ bool IsLevel0CompactionInProgress() const {
+ return !level0_compactions_in_progress_.empty();
+ }
+
+ // Return true if the passed key range overlap with a compaction output
+ // that is currently running.
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Stores the minimal range that covers all entries in inputs in
+ // *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs1 and inputs2
+ // in *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs
+ // in *smallest, *largest.
+ // REQUIRES: inputs is not empty (at least one entry has one file)
+ void GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest) const;
+
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+ // Add more files to the inputs on "level" to make sure that
+ // no newer version of a key is compacted to "level+1" while leaving an older
+ // version in "level". Otherwise, any Get() will search "level" first,
+ // and will likely return an old/stale value for the key, since it always
+ // searches in increasing order of level to find the value. This could
+ // also scramble the order of merge operands. This function should be
+ // called any time a new Compaction is created, and its inputs_[0] are
+ // populated.
+ //
+ // Will return false if it is impossible to apply this compaction.
+ bool ExpandInputsToCleanCut(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest = nullptr);
+
+ // Returns true if any one of the parent files is being compacted
+ bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest, int level, int* index);
+
+ // Returns true if the key range that `inputs` files cover overlap with the
+ // key range of a currently running compaction.
+ bool FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level) const;
+
+ bool SetupOtherInputs(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs,
+ int* parent_index, int base_index);
+
+ void GetGrandparents(VersionStorageInfo* vstorage,
+ const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents);
+
+ void PickFilesMarkedForCompaction(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ int* start_level, int* output_level,
+ CompactionInputFiles* start_level_inputs);
+
+ bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+ CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index);
+
+ // Register this compaction in the set of running compactions
+ void RegisterCompaction(Compaction* c);
+
+ // Remove this compaction from the set of running compactions
+ void UnregisterCompaction(Compaction* c);
+
+ std::set<Compaction*>* level0_compactions_in_progress() {
+ return &level0_compactions_in_progress_;
+ }
+ std::unordered_set<Compaction*>* compactions_in_progress() {
+ return &compactions_in_progress_;
+ }
+
+ protected:
+ const ImmutableCFOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+ virtual Status SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Keeps track of all compactions that are running on Level0.
+ // Protected by DB mutex
+ std::set<Compaction*> level0_compactions_in_progress_;
+
+ // Keeps track of all compactions that are running.
+ // Protected by DB mutex
+ std::unordered_set<Compaction*> compactions_in_progress_;
+
+ const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
+// A dummy compaction picker that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+ NullCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual ~NullCompactionPicker() {}
+
+ // Always return "nullptr"
+ Compaction* PickCompaction(
+ const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+ SequenceNumber /* earliest_memtable_seqno */) override {
+ return nullptr;
+ }
+
+ // Always return "nullptr"
+ Compaction* CompactRange(const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ VersionStorageInfo* /*vstorage*/,
+ int /*input_level*/, int /*output_level*/,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/,
+ const InternalKey* /*end*/,
+ InternalKey** /*compaction_end*/,
+ bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/) override {
+ return nullptr;
+ }
+
+ // Always returns false.
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* /*vstorage*/) const override {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files Metadata for L0 files.
+// @param min_files_to_compact Minimum number of files required to
+// do the compaction.
+// @param max_compact_bytes_per_del_file Maximum average size in bytes per
+// file that is going to get deleted by
+// the compaction.
+// @param max_compaction_bytes Maximum total size in bytes (in terms
+// of compensated file size) for files
+// to be compacted.
+// @param [out] comp_inputs If a compaction was found, will be
+// initialized with corresponding input
+// files. Cannot be nullptr.
+//
+// @return true iff compaction was found.
+bool FindIntraL0Compaction(
+ const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
+
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression = true);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..b148aadc2
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,244 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+ uint64_t total_size = 0;
+ for (const auto& f : files) {
+ total_size += f->fd.file_size;
+ }
+ return total_size;
+}
+} // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+ assert(mutable_cf_options.ttl > 0);
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ int64_t _current_time;
+ auto status = ioptions_.env->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on TTL. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.ttl) {
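+ // Walk the L0 files starting from the oldest one and collect every file
+ // whose creation time falls outside the TTL window; stop at the first file
+ // that is still within the TTL or whose creation time is unknown (0).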
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ uint64_t creation_time =
+ f->fd.table_reader->GetTableProperties()->creation_time;
+ if (creation_time == 0 ||
+ creation_time >= (current_time - mutable_cf_options.ttl)) {
+ break;
+ }
+ total_size -= f->compensated_file_size;
+ inputs[0].files.push_back(f);
+ }
+ }
+ }
+
+ // Return nullptr and proceed to size-based FIFO compaction if:
+ // 1. there are no files older than ttl, OR
+ // 2. there are a few files older than ttl, but deleting them will not bring
+ // the total size below the max_table_files_size threshold.
+ if (inputs[0].files.empty() ||
+ total_size >
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ return nullptr;
+ }
+
+ for (const auto& f : inputs[0].files) {
+ uint64_t creation_time = 0;
+ if (f && f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+ }
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with creation time %" PRIu64 " for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), creation_time);
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+ kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+ {}, /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, CompactionReason::kFIFOTtl);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size ||
+ level_files.size() == 0) {
+ // total size not exceeded
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+ level_files.size() > 0) {
+ CompactionInputFiles comp_inputs;
+ // try to prevent the same files from being compacted multiple times, which
+ // could produce large files that may never TTL-expire. Achieve this by
+ // disallowing compactions with files larger than the memtable (inflate its
+ // size by 10% to account for uncompressed L0 files that may be slightly
+ // larger than the memtable size limit).
+ size_t max_compact_bytes_per_del_file =
+ static_cast<size_t>(MultiplyCheckOverflow(
+ static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+ 1.1));
+ if (FindIntraL0Compaction(
+ level_files,
+ mutable_cf_options
+ .level0_file_num_compaction_trigger /* min_files_to_compact */
+ ,
+ max_compact_bytes_per_del_file,
+ mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0,
+ 16 * 1024 * 1024 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */,
+ 0 /* output path ID */, mutable_cf_options.compression,
+ ioptions_.compression_opts, 0 /* max_subcompactions */, {},
+ /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ false,
+ CompactionReason::kFIFOReduceNumFiles);
+ return c;
+ }
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+ ", max size %" PRIu64 "\n",
+ cf_name.c_str(), total_size,
+ mutable_cf_options.compaction_options_fifo.max_table_files_size);
+ return nullptr;
+ }
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
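+ // Walk the L0 files starting from the oldest one and mark them for deletion
+ // until the remaining total size drops back under max_table_files_size.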
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ auto f = *ritr;
+ total_size -= f->compensated_file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+ kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+ {}, /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber /*earliest_memtable_seqno*/) {
+ assert(vstorage->num_levels() == 1);
+
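+ // Try TTL-based deletion first; if TTL is disabled or it finds nothing to
+ // do, fall back to size-based FIFO compaction.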
+ Compaction* c = nullptr;
+ if (mutable_cf_options.ttl > 0) {
+ c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
+ }
+ RegisterCompaction(c);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/, const InternalKey* /*end*/,
+ InternalKey** compaction_end, bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/) {
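+ // FIFO has a single level, so a manual CompactRange is served by a regular
+ // FIFO pick over the whole level; *compaction_end is cleared to indicate
+ // that the entire requested range is covered.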
+#ifdef NDEBUG
+ (void)input_level;
+ (void)output_level;
+#endif
+ assert(input_level == 0);
+ assert(output_level == 0);
+ *compaction_end = nullptr;
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log);
+ Compaction* c =
+ PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer);
+ log_buffer.FlushBufferToLog();
+ return c;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h
new file mode 100644
index 000000000..eb786e5ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+ FIFOCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore) override;
+
+ // The maximum allowed output level. Always returns 0.
+ virtual int MaxOutputLevel() const override { return 0; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+
+ private:
+ Compaction* PickTTLCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickSizeCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc
new file mode 100644
index 000000000..012edd080
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.cc
@@ -0,0 +1,558 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/compaction/compaction_picker_level.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ if (!vstorage->ExpiredTtlFiles().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+ if (vstorage->CompactionScore(i) >= 1) {
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+ LevelCompactionBuilder(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ SequenceNumber earliest_mem_seqno,
+ CompactionPicker* compaction_picker,
+ LogBuffer* log_buffer,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableCFOptions& ioptions)
+ : cf_name_(cf_name),
+ vstorage_(vstorage),
+ earliest_mem_seqno_(earliest_mem_seqno),
+ compaction_picker_(compaction_picker),
+ log_buffer_(log_buffer),
+ mutable_cf_options_(mutable_cf_options),
+ ioptions_(ioptions) {}
+
+ // Pick and return a compaction.
+ Compaction* PickCompaction();
+
+ // Pick the initial files to compact to the next level (or to compact
+ // together in an intra-L0 compaction).
+ void SetupInitialFiles();
+
+ // If the initial files are from L0 level, pick other L0
+ // files if needed.
+ bool SetupOtherL0FilesIfNeeded();
+
+ // Based on the initial files, set up the other files that need to be
+ // compacted in this compaction.
+ bool SetupOtherInputsIfNeeded();
+
+ Compaction* GetCompaction();
+
+ // For the specified level, pick a file that we want to compact.
+ // Returns false if there is no file to compact.
+ // If it returns true, inputs->files.size() will be exactly one.
+ // If level is 0 and there is already a compaction on that level, this
+ // function will return false.
+ bool PickFileToCompact();
+
+ // For L0->L0, picks the longest span of files that aren't currently
+ // undergoing compaction for which work-per-deleted-file decreases. The span
+ // always starts from the newest L0 file.
+ //
+ // Intra-L0 compaction is independent of all other files, so it can be
+ // performed even when L0->base_level compactions are blocked.
+ //
+ // Returns true if `inputs` is populated with a span of files to be compacted;
+ // otherwise, returns false.
+ bool PickIntraL0Compaction();
+
+ void PickExpiredTtlFiles();
+
+ void PickFilesMarkedForPeriodicCompaction();
+
+ const std::string& cf_name_;
+ VersionStorageInfo* vstorage_;
+ SequenceNumber earliest_mem_seqno_;
+ CompactionPicker* compaction_picker_;
+ LogBuffer* log_buffer_;
+ int start_level_ = -1;
+ int output_level_ = -1;
+ int parent_index_ = -1;
+ int base_index_ = -1;
+ double start_level_score_ = 0;
+ bool is_manual_ = false;
+ CompactionInputFiles start_level_inputs_;
+ std::vector<CompactionInputFiles> compaction_inputs_;
+ CompactionInputFiles output_level_inputs_;
+ std::vector<FileMetaData*> grandparents_;
+ CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+ const MutableCFOptions& mutable_cf_options_;
+ const ImmutableCFOptions& ioptions_;
+ // Pick a path ID to place a newly generated file, given its level
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ static const int kMinFilesForIntraL0Compaction = 4;
+};
+
+void LevelCompactionBuilder::PickExpiredTtlFiles() {
+ if (vstorage_->ExpiredTtlFiles().empty()) {
+ return;
+ }
+
+ auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+ // If it's being compacted it has nothing to do here.
+ // If this assert() fails that means that some function marked some
+ // files as being_compacted, but didn't call ComputeCompactionScore()
+ assert(!level_file.second->being_compacted);
+ start_level_ = level_file.first;
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+
+ if ((start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+ (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty())) {
+ return false;
+ }
+
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_);
+ };
+
+ for (auto& level_file : vstorage_->ExpiredTtlFiles()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() {
+ if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ return;
+ }
+
+ auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+ // If it's being compacted it has nothing to do here.
+ // If this assert() fails that means that some function marked some
+ // files as being_compacted, but didn't call ComputeCompactionScore()
+ assert(!level_file.second->being_compacted);
+ output_level_ = start_level_ = level_file.first;
+
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_);
+ };
+
+ for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+ // Find the compactions by size on all levels.
+ bool skipped_l0_to_base = false;
+ for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+ start_level_score_ = vstorage_->CompactionScore(i);
+ start_level_ = vstorage_->CompactionScoreLevel(i);
+ assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+ if (start_level_score_ >= 1) {
+ if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+ // If L0->base_level compaction is pending, don't schedule further
+ // compaction from base level. Otherwise L0->base_level compaction
+ // may starve.
+ continue;
+ }
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ if (PickFileToCompact()) {
+ // found the compaction!
+ if (start_level_ == 0) {
+ // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ } else {
+ // L1+ score = `Level files size` / `MaxBytesForLevel`
+ compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+ }
+ break;
+ } else {
+ // didn't find the compaction, clear the inputs
+ start_level_inputs_.clear();
+ if (start_level_ == 0) {
+ skipped_l0_to_base = true;
+ // L0->base_level may be blocked due to ongoing L0->base_level
+ // compactions. It may also be blocked by an ongoing compaction from
+ // base_level downwards.
+ //
+ // In these cases, to reduce L0 file count and thus reduce likelihood
+ // of write stalls, we can attempt compacting a span of files within
+ // L0.
+ if (PickIntraL0Compaction()) {
+ output_level_ = 0;
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // if we didn't find a compaction, check if there are any files marked for
+ // compaction
+ if (start_level_inputs_.empty()) {
+ parent_index_ = base_index_ = -1;
+
+ compaction_picker_->PickFilesMarkedForCompaction(
+ cf_name_, vstorage_, &start_level_, &output_level_,
+ &start_level_inputs_);
+ if (!start_level_inputs_.empty()) {
+ is_manual_ = true;
+ compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
+ return;
+ }
+ }
+
+ // Bottommost Files Compaction on deleting tombstones
+ if (start_level_inputs_.empty()) {
+ size_t i;
+ for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size();
+ ++i) {
+ auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i];
+ assert(!level_and_file.second->being_compacted);
+ start_level_inputs_.level = output_level_ = start_level_ =
+ level_and_file.first;
+ start_level_inputs_.files = {level_and_file.second};
+ if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_)) {
+ break;
+ }
+ }
+ if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) {
+ start_level_inputs_.clear();
+ } else {
+ assert(!start_level_inputs_.empty());
+ compaction_reason_ = CompactionReason::kBottommostFiles;
+ return;
+ }
+ }
+
+ // TTL Compaction
+ if (start_level_inputs_.empty()) {
+ PickExpiredTtlFiles();
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kTtl;
+ return;
+ }
+ }
+
+ // Periodic Compaction
+ if (start_level_inputs_.empty()) {
+ PickFilesMarkedForPeriodicCompaction();
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kPeriodicCompaction;
+ return;
+ }
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
+ if (start_level_ == 0 && output_level_ != 0) {
+ return compaction_picker_->GetOverlappingL0Files(
+ vstorage_, &start_level_inputs_, output_level_, &parent_index_);
+ }
+ return true;
+}
+
+bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
+ // Set up input files from the output level. For output to L0, we only
+ // compact spans of files that do not interact with any pending compactions,
+ // so we don't need to consider other levels.
+ if (output_level_ != 0) {
+ output_level_inputs_.level = output_level_;
+ if (!compaction_picker_->SetupOtherInputs(
+ cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
+ &output_level_inputs_, &parent_index_, base_index_)) {
+ return false;
+ }
+
+ compaction_inputs_.push_back(start_level_inputs_);
+ if (!output_level_inputs_.empty()) {
+ compaction_inputs_.push_back(output_level_inputs_);
+ }
+
+ // In some edge cases we could pick a compaction that will be compacting
+ // a key range that overlap with another running compaction, and both
+ // of them have the same output level. This could happen if
+ // (1) we are running a non-exclusive manual compaction
+ // (2) AddFile ingest a new file into the LSM tree
+ // We need to disallow this from happening.
+ if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_,
+ output_level_)) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ return false;
+ }
+ compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
+ output_level_inputs_, &grandparents_);
+ } else {
+ compaction_inputs_.push_back(start_level_inputs_);
+ }
+ return true;
+}
+
+Compaction* LevelCompactionBuilder::PickCompaction() {
+ // Pick up the first file to start compaction. It may have been extended
+ // to a clean cut.
+ SetupInitialFiles();
+ if (start_level_inputs_.empty()) {
+ return nullptr;
+ }
+ assert(start_level_ >= 0 && output_level_ >= 0);
+
+ // If it is a L0 -> base level compaction, we need to set up other L0
+ // files if needed.
+ if (!SetupOtherL0FilesIfNeeded()) {
+ return nullptr;
+ }
+
+ // Pick files in the output level and expand more files in the start level
+ // if needed.
+ if (!SetupOtherInputsIfNeeded()) {
+ return nullptr;
+ }
+
+ // Form a compaction object containing the files we picked.
+ Compaction* c = GetCompaction();
+
+ TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
+
+ return c;
+}
+
+Compaction* LevelCompactionBuilder::GetCompaction() {
+ auto c = new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_),
+ output_level_,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level_,
+ ioptions_.compaction_style, vstorage_->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options_.max_compaction_bytes,
+ GetPathId(ioptions_, mutable_cf_options_, output_level_),
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
+ output_level_, vstorage_->base_level()),
+ GetCompressionOptions(ioptions_, vstorage_, output_level_),
+ /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+ start_level_score_, false /* deletion_compaction */, compaction_reason_);
+
+ // If it's level 0 compaction, make sure we don't execute any other level 0
+ // compactions in parallel
+ compaction_picker_->RegisterCompaction(c);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed the compaction score, we
+ // recalculate it here.
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ return c;
+}
+
+/*
+ * Find the optimal path to place a file.
+ * Given a level, finds the path where that level will fit once the sizes of
+ * all smaller levels have been charged against the preceding paths' targets.
+ */
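+// Worked example (hypothetical numbers, and assuming
+// max_bytes_for_level_multiplier = 10 with the default per-level additional
+// multiplier of 1): with two cf_paths whose target sizes are {10GB, 100GB}
+// and max_bytes_for_level_base = 1GB, L0 and L1 (~1GB each) are charged
+// against path 0; L2 (~10GB) no longer fits in the ~8GB left on path 0, so
+// levels 2 and above fall back to the last path (path 1).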
+uint32_t LevelCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, int level) {
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+
+ // size remaining in the most recent path
+ uint64_t current_path_size = ioptions.cf_paths[0].target_size;
+
+ uint64_t level_size;
+ int cur_level = 0;
+
+ // max_bytes_for_level_base denotes L1 size.
+ // We estimate L0 size to be the same as L1.
+ level_size = mutable_cf_options.max_bytes_for_level_base;
+
+ // Last path is the fallback
+ while (p < ioptions.cf_paths.size() - 1) {
+ if (level_size <= current_path_size) {
+ if (cur_level == level) {
+ // Does desired level fit in this path?
+ return p;
+ } else {
+ current_path_size -= level_size;
+ if (cur_level > 0) {
+ if (ioptions.level_compaction_dynamic_level_bytes) {
+ // Currently, level_compaction_dynamic_level_bytes is ignored when
+ // multiple db paths are specified. https://github.com/facebook/
+ // rocksdb/blob/master/db/column_family.cc.
+ // Still, adding this check to avoid accidentally using
+ // max_bytes_for_level_multiplier_additional
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+ } else {
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+ mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+ }
+ }
+ cur_level++;
+ continue;
+ }
+ }
+ p++;
+ current_path_size = ioptions.cf_paths[p].target_size;
+ }
+ return p;
+}
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+ // level 0 files are overlapping. So we cannot pick more
+ // than one concurrent compaction at this level. This
+ // could be made better by looking at key-ranges that are
+ // being compacted at level 0.
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+ return false;
+ }
+
+ start_level_inputs_.clear();
+
+ assert(start_level_ >= 0);
+
+ // Pick the largest file in this level that is not already
+ // being compacted
+ const std::vector<int>& file_size =
+ vstorage_->FilesByCompactionPri(start_level_);
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ unsigned int cmp_idx;
+ for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+ cmp_idx < file_size.size(); cmp_idx++) {
+ int index = file_size[cmp_idx];
+ auto* f = level_files[index];
+
+ // do not pick a file to compact if it is being compacted
+ // from n-1 level.
+ if (f->being_compacted) {
+ continue;
+ }
+
+ start_level_inputs_.files.push_back(f);
+ start_level_inputs_.level = start_level_;
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {start_level_inputs_}, output_level_)) {
+ // A locked (pending compaction) input-level file was pulled in due to
+ // user-key overlap.
+ start_level_inputs_.clear();
+ continue;
+ }
+
+ // Now that the input level is fully expanded, we check whether any output
+ // files are locked due to a pending compaction.
+ //
+ // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+ // level files are locked, not just the extra ones pulled in for user-key
+ // overlap.
+ InternalKey smallest, largest;
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty() &&
+ !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ start_level_inputs_.clear();
+ continue;
+ }
+ base_index_ = index;
+ break;
+ }
+
+ // store where to start the iteration in the next call to PickCompaction
+ vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+
+ return start_level_inputs_.size() > 0;
+}
+
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+ start_level_inputs_.clear();
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(0 /* level */);
+ if (level_files.size() <
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+ level_files[0]->being_compacted) {
+ // If L0 isn't accumulating many files beyond the regular trigger, don't
+ // resort to L0->L0 compaction yet.
+ return false;
+ }
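+ // port::kMaxUint64 disables the per-deleted-file size limit here, leaving
+ // only the total max_compaction_bytes bound on the picked span.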
+ return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+ port::kMaxUint64,
+ mutable_cf_options_.max_compaction_bytes,
+ &start_level_inputs_, earliest_mem_seqno_);
+}
+} // namespace
+
+Compaction* LevelCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_mem_seqno) {
+ LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
+ log_buffer, mutable_cf_options, ioptions_);
+ return builder.PickCompaction();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h
new file mode 100644
index 000000000..b82070e14
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+ LevelCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc
new file mode 100644
index 000000000..278bdb06a
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_test.cc
@@ -0,0 +1,1741 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include <limits>
+#include <string>
+#include <utility>
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ size_t log_count;
+};
+
+class CompactionPickerTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ LevelCompactionPicker level_compaction_picker;
+ std::string cf_name_;
+ CountingLogger logger_;
+ LogBuffer log_buffer_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::unique_ptr<VersionStorageInfo> vstorage_;
+ std::vector<std::unique_ptr<FileMetaData>> files_;
+ // does not own FileMetaData
+ std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+ // input files to compaction process.
+ std::vector<CompactionInputFiles> input_files_;
+ int compaction_level_start_;
+
+ CompactionPickerTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ level_compaction_picker(ioptions_, &icmp_),
+ cf_name_("dummy"),
+ log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+ file_num_(1),
+ vstorage_(nullptr) {
+ mutable_cf_options_.ttl = 0;
+ mutable_cf_options_.periodic_compaction_seconds = 0;
+ // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+ // tests to cover.
+ ioptions_.compaction_pri = kByCompensatedSize;
+ fifo_options_.max_table_files_size = 1;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ ioptions_.cf_paths.emplace_back("dummy",
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ ~CompactionPickerTest() override {}
+
+ void NewVersionStorage(int num_levels, CompactionStyle style) {
+ DeleteVersionStorage();
+ options_.num_levels = num_levels;
+ vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+ style, nullptr, false));
+ vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ }
+
+ void DeleteVersionStorage() {
+ vstorage_.reset();
+ files_.clear();
+ file_map_.clear();
+ input_files_.clear();
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ size_t compensated_file_size = 0) {
+ assert(level < vstorage_->num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size,
+ InternalKey(smallest, smallest_seq, kTypeValue),
+ InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+ largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ f->compensated_file_size =
+ (compensated_file_size != 0) ? compensated_file_size : file_size;
+ vstorage_->AddFile(level, f);
+ files_.emplace_back(f);
+ file_map_.insert({file_number, {f, level}});
+ }
+
+ void SetCompactionInputFilesLevels(int level_count, int start_level) {
+ input_files_.resize(level_count);
+ for (int i = 0; i < level_count; ++i) {
+ input_files_[i].level = start_level + i;
+ }
+ compaction_level_start_ = start_level;
+ }
+
+ void AddToCompactionFiles(uint32_t file_number) {
+ auto iter = file_map_.find(file_number);
+ assert(iter != file_map_.end());
+ int level = iter->second.second;
+ assert(level < vstorage_->num_levels());
+ input_files_[level - compaction_level_start_].files.emplace_back(
+ iter->second.first);
+ }
+
+ void UpdateVersionStorageInfo() {
+ vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ vstorage_->UpdateFilesByCompactionPri(ioptions_.compaction_pri);
+ vstorage_->UpdateNumNonEmptyLevels();
+ vstorage_->GenerateFileIndexer();
+ vstorage_->GenerateLevelFilesBrief();
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ vstorage_->GenerateLevel0NonOverlapping();
+ vstorage_->ComputeFilesMarkedForCompaction();
+ vstorage_->SetFinalized();
+ }
+};
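+
+// The tests below follow a common pattern: build a synthetic LSM shape with
+// NewVersionStorage() and Add(), finalize it with UpdateVersionStorageInfo(),
+// then ask a picker for a compaction and inspect the chosen inputs, e.g.
+//
+//   NewVersionStorage(6, kCompactionStyleLevel);
+//   Add(0, 1U, "150", "200");  // level, file number, smallest/largest key
+//   UpdateVersionStorageInfo();
+//   std::unique_ptr<Compaction> c(level_compaction_picker.PickCompaction(
+//       cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));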
+
+TEST_F(CompactionPickerTest, Empty) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ UpdateVersionStorageInfo();
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "p", "q");
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+ mutable_cf_options_.target_file_size_base = 10000000000;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000001U);
+ Add(1, 88U, "201", "300", 1000000000U);
+ Add(2, 6U, "150", "179", 1000000000U);
+ Add(2, 7U, "180", "220", 1000000000U);
+ Add(2, 8U, "221", "300", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
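+ // 1073741824 is 1 GiB; the expectation suggests the preallocation hint is
+ // capped there even though the ~3GB of inputs plus slack would be larger
+ // (an inference from this expectation, not a documented limit).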
+ ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ Add(0, 1U, "150", "200", 1000000U);
+ // Level 1 score 1.2
+ Add(1, 66U, "150", "200", 6000000U);
+ Add(1, 88U, "201", "300", 6000000U);
+ // Level 2 score 1.8. File 7 is the largest. Should be picked
+ Add(2, 6U, "150", "179", 60000000U);
+ Add(2, 7U, "180", "220", 60000001U);
+ Add(2, 8U, "221", "300", 60000000U);
+ // Level 3 score slightly larger than 1
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
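+ // Expected hint: target_file_size_base plus ~10% slack (10MB + 1MB), since
+ // the 60MB input is larger than the 10MB target output file size (a reading
+ // of the expression below, not a separate guarantee).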
+ ASSERT_EQ(mutable_cf_options_.target_file_size_base +
+ mutable_cf_options_.target_file_size_base / 10,
+ compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+ const int kLevels = 6;
+ const int kFileCount = 20;
+
+ for (int level = 0; level < kLevels - 1; ++level) {
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
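+ // Each file is sized so that kFileCount of them total twice this level's
+ // byte target, so the level becomes eligible for compaction partway
+ // through the inner loop below (for L0 the file-count trigger applies
+ // instead).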
+ for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+ // start a brand new version in each test.
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ for (int i = 0; i < file_count; ++i) {
+ Add(level, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(),
+ file_size, 0, i * 100, i * 100 + 99);
+ }
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ // release the version storage
+ DeleteVersionStorage();
+ }
+ }
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 3, 5U, "150", "180", 3U);
+ Add(num_levels - 3, 6U, "181", "300", 3U);
+ Add(num_levels - 3, 7U, "400", "450", 3U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(num_levels - 3, compaction->level(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 1, 4U, "400", "450", 3U);
+ Add(num_levels - 2, 5U, "150", "180", 300U);
+ Add(num_levels - 2, 6U, "181", "350", 500U);
+ Add(num_levels - 2, 7U, "400", "450", 200U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(0, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(
+ ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ // verify the trigger given different numbers of L0 files.
+ for (int i = 1;
+ i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ Add(0, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
+ const uint64_t kFileSize = 100000;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ ioptions_.allow_ingest_behind = true;
+ ioptions_.num_levels = 3;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ // output level should be the one above the bottom-most
+ ASSERT_EQ(1, compaction->output_level());
+}
+// Tests if the files can be trivially moved in multi level
+// universal compaction when allow_trivial_move option is set
+// In this test the input files overlap, so they cannot
+// be trivially moved.
+
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+// Tests if the files can be trivially moved in multi level
+// universal compaction when allow_trivial_move option is set
+// In this test the input files don't overlap, so they should
+// be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(compaction->is_trivial_move());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+ // The case where universal periodic compaction does not
+ // pick up the only level to compact if it doesn't cover
+ // any file marked for periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+ // The case where universal periodic compaction does not
+ // pick up only the last sorted run (an L0 file here) if it isn't
+ // marked for periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+ // The case where universal periodic compaction couldn't form
+ // a compaction that includes any file marked for periodic compaction.
+ // Right now we form the compaction anyway if there is more than one
+ // sorted run. The case is included here just to validate that it
+ // doesn't crash.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(!compaction ||
+ compaction->start_level() != compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+ // Test single L0 file periodic compaction triggering.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 6U, "150", "200", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
+ // Test single sorted run non-L0 periodic compaction
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(4, 5U, "150", "200", kFileSize, 0, 500, 550);
+ Add(4, 6U, "350", "400", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const int kFileCount =
+ mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+ // verify whether compaction is needed based on the current
+ // size of L0 files.
+ uint64_t current_size = 0;
+ for (int i = 1; i <= kFileCount; ++i) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ Add(0, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(),
+ kFileSize, 0, i * 100, i * 100 + 99);
+ current_size += kFileSize;
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 100000000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+ Add(2, 6U, "150", "179", 50000000U);
+ Add(2, 7U, "180", "220", 50000000U);
+ Add(2, 8U, "321", "400", 50000000U); // File not overlapping
+ Add(2, 9U, "721", "800", 50000000U);
+
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ Add(3, 30U, "750", "900", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Pick file 8 because it overlaps with 0 files on level 3.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+ // Compaction input size * 1.1
+ ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+ Add(2, 6U, "150", "175",
+ 60000000U); // Overlaps with file 26, 27, total size 521M
+ Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size
+ // 520M, the smallest overlapping
+ Add(2, 8U, "201", "300",
+ 60000000U); // Overlaps with file 28, 29, total size 521M
+
+ Add(3, 26U, "100", "110", 261000000U);
+ Add(3, 26U, "150", "170", 261000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 261000000U);
+ Add(3, 30U, "321", "400", 261000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 7 because its overlapping ratio is the smallest.
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ // Files 7 and 8 each overlap a single file in the output level, but file 8
+ // itself is larger, so its overlapping ratio is smaller and it is picked.
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ UpdateVersionStorageInfo();
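+ // With kMinOverlappingRatio the candidate whose ratio of overlapping
+ // output-level bytes to its own size is smallest wins: roughly 520M/60M
+ // for file 6, 260M/60M for file 7 and 260M/61M for file 8, so file 8 is
+ // expected here (a reading of the setup above).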
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 8 because its overlapping ratio is the smallest.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ // File 6 overlaps with files 26 and 27. Its compensated size is boosted,
+ // which lowers its overlapping ratio, so it will be picked.
+ Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28
+
+ Add(3, 26U, "160", "165", 60000000U);
+ // Boosted file size in output level is not considered.
+ Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+ Add(3, 28U, "180", "400", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 6 because its boosted compensated size gives it the
+ // smallest overlapping ratio.
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+ int num_levels = ioptions_.num_levels;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200"); // <- marked for compaction
+ Add(1, 3U, "400", "500", 600); // <- this one needs compacting
+ Add(2, 4U, "150", "200");
+ Add(2, 5U, "201", "210");
+ Add(2, 6U, "300", "310");
+ Add(2, 7U, "400", "500"); // <- being compacted
+
+ vstorage_->LevelFiles(2)[3]->being_compacted = true;
+ vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user keys
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kByCompensatedSize;
+
+ Add(1, 1U, "100", "150", 1U);
+ // Overlapping user keys
+ Add(1, 2U, "200", "400", 1U);
+ Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+ Add(2, 4U, "600", "700", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+ // expand multiple times)
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "200", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+ Add(1, 4U, "250", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "199", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1100000U, 0, 0);
+ Add(1, 4U, "251", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+
+ Add(2, 6U, "100", "115", 1U);
+ Add(2, 7U, "125", "325", 1U);
+ Add(2, 8U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "300", 1U, 0, 0);
+ Add(2, 5U, "305", "450", 1U, 0, 0);
+ Add(2, 6U, "460", "600", 1U, 0, 0);
+ Add(2, 7U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+ vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+ Add(2, 3U, "100", "250", 1U);
+ Add(2, 4U, "300", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "800", 1U, 0, 0);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_GE(1U, compaction->num_input_files(0));
+ ASSERT_GE(2U, compaction->num_input_files(1));
+ // File 5 has to be included in the compaction
+ ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // no overlapping case
+ Add(1, 1U, "101", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "150", "200", 1U);
+ Add(2, 7U, "200", "450", 1U, 0, 0);
+ Add(2, 8U, "500", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // overlapping case
+ Add(1, 1U, "121", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "100", "120", 1U);
+ Add(2, 7U, "150", "200", 1U);
+ Add(2, 8U, "200", "450", 1U, 0, 0);
+ Add(2, 9U, "501", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+ // Locked file encountered when pulling in extra input-level files with same
+ // user keys. Verify we pick the next-best file from the same input level.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // file_number 2U is largest and thus first choice. But it overlaps with
+ // file_number 1U which is being compacted. So instead we pick the next-
+ // biggest file, 3U, which is eligible for compaction.
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 900000000U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+ // Locked file encountered when pulling in extra output-level files with same
+ // user keys. Expected to skip that compaction and pick the next-best choice.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // score(L1) = 3.7
+ // score(L2) = 1.85
+ // There is no eligible file in L1 to compact since both candidates pull in
+ // file_number 5U, which overlaps with a file pending compaction (6U). The
+ // first eligible compaction is from L2->L3.
+ Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 5000000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "201" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ file_map_[6U].first->being_compacted = true;
+ Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // No compaction should be scheduled if L0 has higher priority than L1
+ // but the L0->L1 compaction is blocked by a file in L1 being compacted.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If no file in L1 being compacted, L0->L1 compaction will be scheduled.
+ UpdateVersionStorageInfo(); // being_compacted flag is cleared here.
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 score more than 6.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+ Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If the score of L1 is larger than that of L0, the L1 compaction goes
+ // through even though there is a pending L0 compaction.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 3U, "150", "200", 200);
+ // Level 1 is over target by 200
+ Add(1, 4U, "400", "500", 600);
+ Add(1, 5U, "600", "700", 600);
+ // Level 2 is less than its target 10000 even after adding the size of level 1
+ // Size ratio of L2/L1 is 9600 / 1200 = 8
+ Add(2, 6U, "150", "200", 2500);
+ Add(2, 7U, "201", "210", 2000);
+ Add(2, 8U, "300", "310", 2600);
+ Add(2, 9U, "400", "500", 2500);
+ // Level 3 exceeds its target 100,000 by 1000
+ Add(3, 10U, "400", "500", 101000);
+ // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+ // Size ratio L4/L3 is 9.9
+ // After merge from L3, L4 size is 1000900
+ Add(4, 11U, "400", "500", 999900);
+ Add(5, 11U, "400", "500", 8007200);
+
+ UpdateVersionStorageInfo();
+
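+ // Expected total, following the per-level comments above: L1's 200-byte
+ // excess rewritten into L2 at size ratio ~8 (200 * 9), the L3->L4 merge
+ // (10900), and L4's resulting 900-byte excess rewritten into L5 at ratio
+ // ~8 (900 * 9), i.e. 1800 + 10900 + 8100 = 20800 bytes.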
+ ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // Level 1 size will be 1400 after merging with L0
+ Add(1, 7U, "400", "500", 200);
+ Add(1, 8U, "600", "700", 200);
+ // Level 2 is less than its target 10000 even after adding the size of level 1
+ Add(2, 9U, "150", "200", 9100);
+ // Level 3 is over the target, but since level 4 is empty, we assume it
+ // will be a trivial move.
+ Add(3, 10U, "400", "500", 101000);
+
+ UpdateVersionStorageInfo();
+
+ // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0)
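+ // The 1400u term is the estimated L0->L1 work: five 200-byte L0 files plus
+ // the two 200-byte L1 files they merge into (a reading of the setup above).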
+ ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 2000);
+ Add(0, 2U, "150", "200", 2000);
+ Add(0, 4U, "150", "200", 2000);
+ Add(0, 5U, "150", "200", 2000);
+ Add(0, 6U, "150", "200", 1000);
+ // Level 1 size will be 10000 after merging with L0
+ Add(1, 7U, "400", "500", 500);
+ Add(1, 8U, "600", "700", 500);
+
+ Add(2, 9U, "150", "200", 10000);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ // Set Last level size 50000
+ // num_levels - 1 target 5000
+ // num_levels - 2 is base level with target 1000 (rounded up to
+ // max_bytes_for_level_base).
+ Add(num_levels - 1, 10U, "400", "500", 50000);
+
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // num_levels - 3 is over target by 100 + 1000
+ Add(num_levels - 3, 7U, "400", "500", 550);
+ Add(num_levels - 3, 8U, "600", "700", 550);
+ // num_levels - 2 is over target by 1100 + 200
+ Add(num_levels - 2, 9U, "150", "200", 5200);
+
+ UpdateVersionStorageInfo();
+
+ // Merging to the second last level: (5200 / 2100 + 1) * 1100
+ // Merging to the last level: (50000 / 6300 + 1) * 1300
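+ // The leading 2100u is read here as the data that must move down first:
+ // 1000 bytes of L0 plus the 1100 bytes already in num_levels - 3.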
+ ASSERT_EQ(2100u + 3823u + 11617u,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
+ // case 1: Higher levels are empty
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ bool result =
+ Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 2: Higher levels have no overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "k", "p");
+ Add(3, 8U, "t", "w");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 3.1: Higher levels (level 3) have overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "e", "g");
+ Add(3, 8U, "h", "k");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.2: Higher levels (level 5) have overlap
+ DeleteVersionStorage();
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "h", "k");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+ // one key ("d")
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "ccc", "d");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "z");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files don't overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // Level 1 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(1, 5U, "a", "m");
+ Add(1, 6U, "n", "o");
+ Add(1, 7U, "w", "y");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ AddToCompactionFiles(5U);
+ AddToCompactionFiles(6U);
+ AddToCompactionFiles(7U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 800000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick files 2 and 5.
+ // It cannot expand, because adding files 1 and 3 would make the compaction
+ // size exceed mutable_cf_options_.max_compaction_bytes.
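+ // (File sizes: 300001 + 300000 + 300000 + 1 = 900002 bytes > 800000.)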
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "100", "256", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 800000u;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick files 2 and 5,
+ // and it expands to include files 1 and 3 as well.
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "000", "251", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 10000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick all files from level 1
+ Add(1, 1U, "100", "150", 300000U, 0, 0);
+ Add(1, 2U, "150", "200", 300000U, 0, 0);
+ Add(1, 3U, "200", "250", 300000U, 0, 0);
+ Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+ Add(3, 5U, "120", "130", 6000U);
+ Add(3, 6U, "140", "150", 6000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1000000000U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 900000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+ "249" /* largest */, 800000000U /* file_size */);
+ Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+ "299" /* largest */, 700000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 1U /* file_size */);
+ file_map_[5U].first->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
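+ // File 1 is being compacted, and file 2 overlaps L2 file 5, which is also
+ // being compacted, so the first pick is expected to skip both, choose
+ // file 3, and cache the next compaction index for level 1.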
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(0U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(0U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+ ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // All 5 L0 files will be picked for intra-L0 compaction. The one L1 file
+ // spans the entire L0 key range and is marked as being compacted to avoid
+ // an L0->L1 compaction.
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 110, 111);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 5 L0 files will be picked for intra-L0 compaction due to the
+ // max_compaction_bytes limit (the minimum number of files for triggering
+ // intra-L0 compaction is 4). The one L1 file spans the entire L0 key range
+ // and is marked as being compacted to avoid an L0->L1 compaction.
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 109, 110);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 6 L0 files will be picked for intra-L0 compaction due to the
+ // being_compacted limit, and the newest L0 file will be skipped due to the
+ // earliest seqno. The one L1 file spans the entire L0 key range and is
+ // marked as being compacted to avoid an L0->L1 compaction.
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111);
+ Add(0, 2U, "301", "350", 1U, 0, 108, 109);
+ Add(0, 3U, "251", "300", 1U, 0, 106, 107);
+ Add(0, 4U, "201", "250", 1U, 0, 104, 105);
+ Add(0, 5U, "151", "200", 1U, 0, 102, 103);
+ Add(0, 6U, "100", "150", 1U, 0, 100, 101);
+ Add(0, 7U, "100", "100", 1U, 0, 99, 100);
+ vstorage_->LevelFiles(0)[5]->being_compacted = true;
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc
new file mode 100644
index 000000000..d8b63956e
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc
@@ -0,0 +1,1105 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_universal.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// A helper class that forms universal compactions. The class is used by
+// UniversalCompactionPicker::PickCompaction().
+// The usage is to create the class, and get the compaction object by calling
+// PickCompaction().
+class UniversalCompactionBuilder {
+ public:
+ UniversalCompactionBuilder(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp,
+ const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ UniversalCompactionPicker* picker,
+ LogBuffer* log_buffer)
+ : ioptions_(ioptions),
+ icmp_(icmp),
+ cf_name_(cf_name),
+ mutable_cf_options_(mutable_cf_options),
+ vstorage_(vstorage),
+ picker_(picker),
+ log_buffer_(log_buffer) {}
+
+ // Form and return the compaction object. The caller owns the returned object.
+ Compaction* PickCompaction();
+
+ private:
+ struct SortedRun {
+ SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+ uint64_t _compensated_file_size, bool _being_compacted)
+ : level(_level),
+ file(_file),
+ size(_size),
+ compensated_file_size(_compensated_file_size),
+ being_compacted(_being_compacted) {
+ assert(compensated_file_size > 0);
+ assert(level != 0 || file != nullptr);
+ }
+
+ void Dump(char* out_buf, size_t out_buf_size,
+ bool print_path = false) const;
+
+ // sorted_run_count is added into the string to print
+ void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+ size_t sorted_run_count) const;
+
+ int level;
+ // `file` will be null for levels > 0. For level 0, the sorted run consists
+ // of exactly this file.
+ FileMetaData* file;
+ // For levels > 0, `size` and `compensated_file_size` are the sums of the
+ // sizes of all files in the level. `being_compacted` should be the same
+ // for all files in a non-zero level, so use the value stored here.
+ uint64_t size;
+ uint64_t compensated_file_size;
+ bool being_compacted;
+ };
+
+ // Pick Universal compaction to limit read amplification
+ Compaction* PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+ // Pick Universal compaction to limit space amplification.
+ Compaction* PickCompactionToReduceSizeAmp();
+
+ Compaction* PickDeleteTriggeredCompaction();
+
+ // Form a compaction from the sorted run indicated by start_index to the
+ // oldest sorted run.
+ // The caller is responsible for making sure that those files are not in
+ // compaction.
+ Compaction* PickCompactionToOldest(size_t start_index,
+ CompactionReason compaction_reason);
+
+ // Try to pick a periodic compaction. The caller should only call it
+ // if there is at least one file marked for periodic compaction.
+ // nullptr will be returned if no such compaction can be formed
+ // because some files are being compacted.
+ Compaction* PickPeriodicCompaction();
+
+ // Used in universal compaction when the allow_trivial_move option is set.
+ // Checks whether there are any overlapping files in the input. Returns
+ // true if the input files are non-overlapping.
+ bool IsInputFilesNonOverlapping(Compaction* c);
+
+ const ImmutableCFOptions& ioptions_;
+ const InternalKeyComparator* icmp_;
+ double score_;
+ std::vector<SortedRun> sorted_runs_;
+ const std::string& cf_name_;
+ const MutableCFOptions& mutable_cf_options_;
+ VersionStorageInfo* vstorage_;
+ UniversalCompactionPicker* picker_;
+ LogBuffer* log_buffer_;
+
+ static std::vector<SortedRun> CalculateSortedRuns(
+ const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+
+ // Pick a path ID to place a newly generated file, with its estimated file
+ // size.
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of a min-heap
+// that contains the file metadata, the level of the file,
+// and the index of the file in that level.
+
+struct InputFileInfo {
+ InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+ FileMetaData* f;
+ size_t level;
+ size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of a min-heap
+// ordered by the smallest key of each file.
+struct SmallestKeyHeapComparator {
+ explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+ bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+ return (ucmp_->Compare(i1.f->smallest.user_key(),
+ i2.f->smallest.user_key()) > 0);
+ }
+
+ private:
+ const Comparator* ucmp_;
+};
+
+typedef std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+ SmallestKeyHeapComparator>
+ SmallestKeyHeap;
+
+// This function creates the heap that is used to determine whether the files
+// overlap during universal compaction when allow_trivial_move is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+ SmallestKeyHeap smallest_key_priority_q =
+ SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+ InputFileInfo input_file;
+
+ for (size_t l = 0; l < c->num_input_levels(); l++) {
+ if (c->num_input_files(l) != 0) {
+ if (l == 0 && c->start_level() == 0) {
+ for (size_t i = 0; i < c->num_input_files(0); i++) {
+ input_file.f = c->input(0, i);
+ input_file.level = 0;
+ input_file.index = i;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ } else {
+ input_file.f = c->input(l, 0);
+ input_file.level = l;
+ input_file.index = 0;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ }
+ }
+ return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff. `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+ SequenceNumber* smallest_seqno,
+ SequenceNumber* largest_seqno) {
+ bool is_first = true;
+ for (FileMetaData* f : files) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ *smallest_seqno = f->fd.smallest_seqno;
+ *largest_seqno = f->fd.largest_seqno;
+ } else {
+ if (f->fd.smallest_seqno < *smallest_seqno) {
+ *smallest_seqno = f->fd.smallest_seqno;
+ }
+ if (f->fd.largest_seqno > *largest_seqno) {
+ *largest_seqno = f->fd.largest_seqno;
+ }
+ }
+ }
+}
+#endif
+} // namespace
+
+// Algorithm that checks to see if there are any overlapping
+// files in the input
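+// For illustration: if the heap yields files with key ranges [a, c] and then
+// [d, f], the previous file's largest key ("c") is below the next file's
+// smallest key ("d"), so there is no overlap; with ranges [a, e] and [d, f],
+// "e" >= "d" and the function returns false.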
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+ auto comparator = icmp_->user_comparator();
+ int first_iter = 1;
+
+ InputFileInfo prev, curr, next;
+
+ SmallestKeyHeap smallest_key_priority_q =
+ create_level_heap(c, icmp_->user_comparator());
+
+ while (!smallest_key_priority_q.empty()) {
+ curr = smallest_key_priority_q.top();
+ smallest_key_priority_q.pop();
+
+ if (first_iter) {
+ prev = curr;
+ first_iter = 0;
+ } else {
+ if (comparator->Compare(prev.f->largest.user_key(),
+ curr.f->smallest.user_key()) >= 0) {
+ // found overlapping files, return false
+ return false;
+ }
+ assert(comparator->Compare(curr.f->largest.user_key(),
+ prev.f->largest.user_key()) > 0);
+ prev = curr;
+ }
+
+ next.f = nullptr;
+
+ if (c->level(curr.level) != 0 &&
+ curr.index < c->num_input_files(curr.level) - 1) {
+ next.f = c->input(curr.level, curr.index + 1);
+ next.level = curr.level;
+ next.index = curr.index + 1;
+ }
+
+ if (next.f) {
+ smallest_key_priority_q.push(std::move(next));
+ }
+ }
+ return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ if (vstorage->CompactionScore(kLevel0) >= 1) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber /* earliest_memtable_seqno */) {
+ UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+ mutable_cf_options, vstorage, this,
+ log_buffer);
+ return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+ size_t out_buf_size,
+ bool print_path) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ if (file->fd.GetPathId() == 0 || !print_path) {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+ } else {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64
+ "(path "
+ "%" PRIu32 ")",
+ file->fd.GetNumber(), file->fd.GetPathId());
+ }
+ } else {
+ snprintf(out_buf, out_buf_size, "level %d", level);
+ }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+ char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64 "[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+ file->compensated_file_size);
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "level %d[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ level, sorted_run_count, size, compensated_file_size);
+ }
+}
+
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+ const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/,
+ const MutableCFOptions& mutable_cf_options) {
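+ // Each L0 file forms its own sorted run, while each non-empty level >= 1
+ // forms a single sorted run covering all of its files.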
+ std::vector<UniversalCompactionBuilder::SortedRun> ret;
+ for (FileMetaData* f : vstorage.LevelFiles(0)) {
+ ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+ f->being_compacted);
+ }
+ for (int level = 1; level < vstorage.num_levels(); level++) {
+ uint64_t total_compensated_size = 0U;
+ uint64_t total_size = 0U;
+ bool being_compacted = false;
+ bool is_first = true;
+ for (FileMetaData* f : vstorage.LevelFiles(level)) {
+ total_compensated_size += f->compensated_file_size;
+ total_size += f->fd.GetFileSize();
+ if (mutable_cf_options.compaction_options_universal.allow_trivial_move ==
+ true) {
+ if (f->being_compacted) {
+ being_compacted = f->being_compacted;
+ }
+ } else {
+ // Compaction always includes all files for a non-zero level, so for a
+ // non-zero level, all the files should share the same being_compacted
+ // value.
+ // This assumption is only valid when
+ // mutable_cf_options.compaction_options_universal.allow_trivial_move
+ // is false
+ assert(is_first || f->being_compacted == being_compacted);
+ }
+ if (is_first) {
+ being_compacted = f->being_compacted;
+ is_first = false;
+ }
+ }
+ if (total_compensated_size > 0) {
+ ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+ being_compacted);
+ }
+ }
+ return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+ const int kLevel0 = 0;
+ score_ = vstorage_->CompactionScore(kLevel0);
+ sorted_runs_ =
+ CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_);
+
+ if (sorted_runs_.size() == 0 ||
+ (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+ vstorage_->FilesMarkedForCompaction().empty() &&
+ sorted_runs_.size() < (unsigned int)mutable_cf_options_
+ .level0_file_num_compaction_trigger)) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+ cf_name_.c_str());
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER_MAX_SZ(
+ log_buffer_, 3072,
+ "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n",
+ cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+ Compaction* c = nullptr;
+ // Periodic compaction has higher priority than other types of compaction
+ // because it's a hard requirement.
+ if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ // Always need to do a full compaction for periodic compaction.
+ c = PickPeriodicCompaction();
+ }
+
+ // Check for size amplification.
+ if (c == nullptr &&
+ sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger)) {
+ if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification is within limits. Try reducing read
+ // amplification while maintaining file size ratios.
+ unsigned int ratio =
+ mutable_cf_options_.compaction_options_universal.size_ratio;
+
+ if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for size ratio\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification and file size ratios are within configured limits.
+ // If max read amplification exceeds the configured limits, then force a
+ // compaction without looking at file size ratios and try to reduce
+ // the number of files to fewer than level0_file_num_compaction_trigger.
+ // This is guaranteed by NeedsCompaction()
+ assert(sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger));
+ // Get the total number of sorted runs that are not being compacted
+ int num_sr_not_compacted = 0;
+ for (size_t i = 0; i < sorted_runs_.size(); i++) {
+ if (sorted_runs_[i].being_compacted == false) {
+ num_sr_not_compacted++;
+ }
+ }
+
+ // The number of sorted runs that are not being compacted is greater
+ // than the maximum allowed number of sorted runs
+ if (num_sr_not_compacted >
+ mutable_cf_options_.level0_file_num_compaction_trigger) {
+ unsigned int num_files =
+ num_sr_not_compacted -
+ mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+ if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+ nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for file num -- %u\n",
+ cf_name_.c_str(), num_files);
+ }
+ }
+ }
+ }
+ }
+
+ if (c == nullptr) {
+ if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: delete triggered compaction\n",
+ cf_name_.c_str());
+ }
+ }
+
+ if (c == nullptr) {
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+
+ if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+ true &&
+ c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+ c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+ }
+
+// validate that all the chosen files of L0 are non overlapping in time
+#ifndef NDEBUG
+ SequenceNumber prev_smallest_seqno = 0U;
+ bool is_first = true;
+
+ size_t level_index = 0U;
+ if (c->start_level() == 0) {
+ for (auto f : *c->inputs(0)) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ prev_smallest_seqno = f->fd.smallest_seqno;
+ }
+ level_index = 1U;
+ }
+ for (; level_index < c->num_input_levels(); level_index++) {
+ if (c->num_input_files(level_index) != 0) {
+ SequenceNumber smallest_seqno = 0U;
+ SequenceNumber largest_seqno = 0U;
+ GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+ &largest_seqno);
+ if (is_first) {
+ is_first = false;
+ } else if (prev_smallest_seqno > 0) {
+ // A level is considered as the bottommost level if there are
+ // no files in higher levels or if files in higher levels do
+ // not overlap with the files being compacted. Sequence numbers
+ // of files in bottommost level can be set to 0 to help
+ // compression. As a result, the following assert may not hold
+ // if the prev_smallest_seqno is 0.
+ assert(prev_smallest_seqno > largest_seqno);
+ }
+ prev_smallest_seqno = smallest_seqno;
+ }
+ }
+#endif
+ // update statistics
+ RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION,
+ c->inputs(0)->size());
+
+ picker_->RegisterCompaction(c);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+ TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+ c);
+ return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+ // Two conditions need to be satisfied:
+ // (1) the target path needs to be able to hold the file's size
+ // (2) Total size left in this and previous paths need to be not
+ // smaller than expected future file size before this new file is
+ // compacted, which is estimated based on size_ratio.
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
+ // we will make sure the target file, probably with size of 16, will be
+ // placed in a path so that eventually when new files are generated and
+ // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+ // before the path we chose.
+ //
+ // TODO(sdong): now the case of multiple column families is not
+ // considered in this algorithm. So the target size can be violated in
+ // that case. We need to improve it.
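+ //
+ // Illustrative trace with made-up numbers: file_size = 16, size_ratio = 0,
+ // so future_size = 16, and cf_paths target sizes are {10, 100, 1000}.
+ // Path 0 cannot hold the file (10 <= 16), so its 10 is accumulated; path 1
+ // can hold it and 10 + (100 - 16) = 94 > 16, so path 1 is returned.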
+ uint64_t accumulated_size = 0;
+ uint64_t future_size =
+ file_size *
+ (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+ for (; p < ioptions.cf_paths.size() - 1; p++) {
+ uint64_t target_size = ioptions.cf_paths[p].target_size;
+ if (target_size > file_size &&
+ accumulated_size + (target_size - file_size) > future_size) {
+ return p;
+ }
+ accumulated_size += target_size;
+ }
+ return p;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact) {
+ unsigned int min_merge_width =
+ mutable_cf_options_.compaction_options_universal.min_merge_width;
+ unsigned int max_merge_width =
+ mutable_cf_options_.compaction_options_universal.max_merge_width;
+
+ const SortedRun* sr = nullptr;
+ bool done = false;
+ size_t start_index = 0;
+ unsigned int candidate_count = 0;
+
+ unsigned int max_files_to_compact =
+ std::min(max_merge_width, max_number_of_files_to_compact);
+ min_merge_width = std::max(min_merge_width, 2U);
+
+ // Caller checks the size before executing this function. This invariant is
+ // important because otherwise we may have a possible integer underflow when
+ // dealing with unsigned types.
+ assert(sorted_runs_.size() > 0);
+
+ // Considers a candidate file only if it is smaller than the
+ // total size accumulated so far.
+ for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
+ candidate_count = 0;
+
+ // Skip files that are already being compacted
+ for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+
+ if (!sr->being_compacted) {
+ candidate_count = 1;
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: %s"
+ "[%d] being compacted, skipping",
+ cf_name_.c_str(), file_num_buf, loop);
+
+ sr = nullptr;
+ }
+
+ // This file is not being compacted. Consider it as the
+ // first candidate to be compacted.
+ uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+ if (sr != nullptr) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Possible candidate %s[%d].",
+ cf_name_.c_str(), file_num_buf, loop);
+ }
+
+ // Check if the succeeding files need compaction.
+ for (size_t i = loop + 1;
+ candidate_count < max_files_to_compact && i < sorted_runs_.size();
+ i++) {
+ const SortedRun* succeeding_sr = &sorted_runs_[i];
+ if (succeeding_sr->being_compacted) {
+ break;
+ }
+ // Pick files if the total/last candidate file size (increased by the
+ // specified ratio) is still larger than the next candidate file.
+ // candidate_size is the total size of files picked so far with the
+ // default kCompactionStopStyleTotalSize; with
+ // kCompactionStopStyleSimilarSize, it's simply the size of the last
+ // picked file.
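+ // For example, with ratio = 1 and candidate_size = 100, sz = 101: a
+ // succeeding run of size 150 stops the expansion, while one of size 80
+ // would be included.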
+ double sz = candidate_size * (100.0 + ratio) / 100.0;
+ if (sz < static_cast<double>(succeeding_sr->size)) {
+ break;
+ }
+ if (mutable_cf_options_.compaction_options_universal.stop_style ==
+ kCompactionStopStyleSimilarSize) {
+ // Similar-size stopping rule: also check the last picked file isn't
+ // far larger than the next candidate file.
+ sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+ if (sz < static_cast<double>(candidate_size)) {
+ // If the small file we've encountered begins a run of similar-size
+ // files, we'll pick them up on a future iteration of the outer
+ // loop. If it's some lonely straggler, it'll eventually get picked
+ // by the last-resort read amp strategy which disregards size ratios.
+ break;
+ }
+ candidate_size = succeeding_sr->compensated_file_size;
+ } else { // default kCompactionStopStyleTotalSize
+ candidate_size += succeeding_sr->compensated_file_size;
+ }
+ candidate_count++;
+ }
+
+ // Found a series of consecutive files that need compaction.
+ if (candidate_count >= (unsigned int)min_merge_width) {
+ start_index = loop;
+ done = true;
+ break;
+ } else {
+ for (size_t i = loop;
+ i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+ const SortedRun* skipping_sr = &sorted_runs_[i];
+ char file_num_buf[256];
+ skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+ }
+ }
+ if (!done || candidate_count <= 1) {
+ return nullptr;
+ }
+ size_t first_index_after = start_index + candidate_count;
+ // Compression is enabled only if the output of this compaction still falls
+ // within the oldest compression_size_percent of the data; if the sorted
+ // runs older than the ones being compacted already account for at least
+ // that fraction of the total size, the output is left uncompressed.
+ bool enable_compression = true;
+ int ratio_to_compress =
+ mutable_cf_options_.compaction_options_universal.compression_size_percent;
+ if (ratio_to_compress >= 0) {
+ uint64_t total_size = 0;
+ for (auto& sorted_run : sorted_runs_) {
+ total_size += sorted_run.compensated_file_size;
+ }
+
+ uint64_t older_file_size = 0;
+ for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+ older_file_size += sorted_runs_[i].size;
+ if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+ enable_compression = false;
+ break;
+ }
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ for (unsigned int i = 0; i < first_index_after; i++) {
+ estimated_total_size += sorted_runs_[i].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+ int output_level;
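+ // If the picked runs extend to the oldest sorted run, output to the
+ // bottommost level; if the next older run is an L0 file, stay in L0;
+ // otherwise output to the level just above the next older run.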
+ if (first_index_after == sorted_runs_.size()) {
+ output_level = vstorage_->num_levels() - 1;
+ } else if (sorted_runs_[first_index_after].level == 0) {
+ output_level = 0;
+ } else {
+ output_level = sorted_runs_[first_index_after].level - 1;
+ }
+
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind &&
+ (output_level == vstorage_->num_levels() - 1)) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t i = start_index; i < first_index_after; i++) {
+ auto& picking_sr = sorted_runs_[i];
+ if (picking_sr.level == 0) {
+ FileMetaData* picking_file = picking_sr.file;
+ inputs[0].files.push_back(picking_file);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+
+ CompactionReason compaction_reason;
+ if (max_number_of_files_to_compact == UINT_MAX) {
+ compaction_reason = CompactionReason::kUniversalSizeRatio;
+ } else {
+ compaction_reason = CompactionReason::kUniversalSortedRunNum;
+ }
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level,
+ 1, enable_compression),
+ GetCompressionOptions(ioptions_, vstorage_, start_level,
+ enable_compression),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ score_, false /* deletion_compaction */, compaction_reason);
+}
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+ // percentage flexibility while reducing size amplification
+ uint64_t ratio = mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent;
+
+ unsigned int candidate_count = 0;
+ uint64_t candidate_size = 0;
+ size_t start_index = 0;
+ const SortedRun* sr = nullptr;
+
+ assert(!sorted_runs_.empty());
+ if (sorted_runs_.back().being_compacted) {
+ return nullptr;
+ }
+
+ // Skip files that are already being compacted
+ for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) {
+ sr = &sorted_runs_[loop];
+ if (!sr->being_compacted) {
+ start_index = loop; // Consider this as the first candidate.
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: skipping %s[%d] compacted %s",
+ cf_name_.c_str(), file_num_buf, loop,
+ " cannot be a candidate to reduce size amp.\n");
+ sr = nullptr;
+ }
+
+ if (sr == nullptr) {
+ return nullptr; // no candidate files
+ }
+ {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+ cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+ }
+
+ // keep adding up all the remaining files
+ for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) {
+ sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+ cf_name_.c_str(), file_num_buf, start_index,
+ " is already being compacted. No size amp reduction possible.\n");
+ return nullptr;
+ }
+ candidate_size += sr->compensated_file_size;
+ candidate_count++;
+ }
+ if (candidate_count == 0) {
+ return nullptr;
+ }
+
+ // size of earliest file
+ uint64_t earliest_file_size = sorted_runs_.back().size;
+
+ // size amplification = percentage of additional size
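+ // For example, with max_size_amplification_percent = 200, newer files
+ // totalling 150 and an earliest file of size 100: 150 * 100 < 200 * 100,
+ // so no size-amp compaction would be needed.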
+ if (candidate_size * 100 < ratio * earliest_file_size) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, earliest_file_size);
+ return nullptr;
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, earliest_file_size);
+ }
+ return PickCompactionToOldest(start_index,
+ CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeletionCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+ CompactionInputFiles start_level_inputs;
+ int output_level;
+ std::vector<CompactionInputFiles> inputs;
+
+ if (vstorage_->num_levels() == 1) {
+ // This is single level universal. Since we're basically trying to reclaim
+ // space by processing files marked for compaction due to high tombstone
+ // density, let's do the same thing as compaction to reduce size amp which
+ // has the same goals.
+ bool compact = false;
+
+ start_level_inputs.level = 0;
+ start_level_inputs.files.clear();
+ output_level = 0;
+ for (FileMetaData* f : vstorage_->LevelFiles(0)) {
+ if (f->marked_for_compaction) {
+ compact = true;
+ }
+ if (compact) {
+ start_level_inputs.files.push_back(f);
+ }
+ }
+ if (start_level_inputs.size() <= 1) {
+ // If only the last file in L0 is marked for compaction, ignore it
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ } else {
+ int start_level;
+
+ // For multi-level universal, the strategy is to make this look more like
+ // leveled. We pick one of the files marked for compaction and compact with
+ // overlapping files in the adjacent level.
+ picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
+ &output_level, &start_level_inputs);
+ if (start_level_inputs.empty()) {
+ return nullptr;
+ }
+
+ // Pick the first non-empty level after the start_level
+ for (output_level = start_level + 1; output_level < vstorage_->num_levels();
+ output_level++) {
+ if (vstorage_->NumLevelFiles(output_level) != 0) {
+ break;
+ }
+ }
+
+ // If all higher levels are empty, pick the highest level as output level
+ if (output_level == vstorage_->num_levels()) {
+ if (start_level == 0) {
+ output_level = vstorage_->num_levels() - 1;
+ } else {
+ // If start level is non-zero and all higher levels are empty, this
+ // compaction will translate into a trivial move. Since the idea is
+ // to reclaim space and trivial move doesn't help with that, we
+ // skip compaction in this case and return nullptr
+ return nullptr;
+ }
+ }
+ if (ioptions_.allow_ingest_behind &&
+ output_level == vstorage_->num_levels() - 1) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ if (output_level != 0) {
+ if (start_level == 0) {
+ if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
+ output_level, nullptr)) {
+ return nullptr;
+ }
+ }
+
+ CompactionInputFiles output_level_inputs;
+ int parent_index = -1;
+
+ output_level_inputs.level = output_level;
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &start_level_inputs, &output_level_inputs,
+ &parent_index, -1)) {
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ if (!output_level_inputs.empty()) {
+ inputs.push_back(output_level_inputs);
+ }
+ if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) {
+ return nullptr;
+ }
+ } else {
+ inputs.push_back(start_level_inputs);
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ // Use size of the output level as estimated file size
+ for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
+ estimated_total_size += f->fd.GetFileSize();
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
+ output_level, 1),
+ GetCompressionOptions(ioptions_, vstorage_, output_level),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true,
+ score_, false /* deletion_compaction */,
+ CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+ size_t start_index, CompactionReason compaction_reason) {
+ assert(start_index < sorted_runs_.size());
+
+ // Estimate total file size
+ uint64_t estimated_total_size = 0;
+ for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+ estimated_total_size += sorted_runs_[loop].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+ auto& picking_sr = sorted_runs_[loop];
+ if (picking_sr.level == 0) {
+ FileMetaData* f = picking_sr.file;
+ inputs[0].files.push_back(f);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ std::string comp_reason_print_string;
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ comp_reason_print_string = "periodic compaction";
+ } else if (compaction_reason ==
+ CompactionReason::kUniversalSizeAmplification) {
+ comp_reason_print_string = "size amp";
+ } else {
+ assert(false);
+ }
+
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+ cf_name_.c_str(), comp_reason_print_string.c_str(),
+ file_num_buf);
+ }
+
+ // Output files to the bottommost level, unless it is reserved.
+ int output_level = vstorage_->num_levels() - 1;
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ // We never check size for
+ // compaction_options_universal.compression_size_percent,
+ // because we always compact all the files, so always compress.
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level,
+ 1, true /* enable_compression */),
+ GetCompressionOptions(ioptions_, vstorage_, start_level,
+ true /* enable_compression */),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ score_, false /* deletion_compaction */, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+ cf_name_.c_str());
+
+ // In universal compaction, sorted runs containing older data are almost
+ // always generated earlier too. To simplify the problem, we just try to
+ // trigger a full compaction. We start from the oldest sorted run and
+ // include all sorted runs, until we hit a sorted run that is already being
+ // compacted. Since usually the largest (which is usually the oldest)
+ // sorted run is included anyway, doing a full compaction won't increase
+ // write amplification much.
+
+ // Get some information from marked files to check whether a file is
+ // included in the compaction.
+
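+ // Extend the pick backwards from the oldest sorted run towards newer ones,
+ // stopping at the first run that is already being compacted. If even the
+ // oldest run is being compacted, no periodic compaction can be formed.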
+ size_t start_index = sorted_runs_.size();
+ while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+ start_index--;
+ }
+ if (start_index == sorted_runs_.size()) {
+ return nullptr;
+ }
+
+ // There is a rare corner case where we can't pick up all of the files
+ // because some are being compacted, and we end up picking files none of
+ // which actually needs periodic compaction. To keep the logic simple, we
+ // just execute the compaction anyway, unless we would simply be
+ // recompacting the last sorted run (either the last level or the last L0
+ // file); in that case, we verify below that it really covers a file marked
+ // for periodic compaction.
+ if (start_index == sorted_runs_.size() - 1) {
+ bool included_file_marked = false;
+ int start_level = sorted_runs_[start_index].level;
+ FileMetaData* start_file = sorted_runs_[start_index].file;
+ for (const std::pair<int, FileMetaData*>& level_file_pair :
+ vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (start_level != 0) {
+ // Last sorted run is a level
+ if (start_level == level_file_pair.first) {
+ included_file_marked = true;
+ break;
+ }
+ } else {
+ // Last sorted run is a L0 file.
+ if (start_file == level_file_pair.second) {
+ included_file_marked = true;
+ break;
+ }
+ }
+ }
+ if (!included_file_marked) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Cannot form a compaction covering file "
+ "marked for periodic compaction",
+ cf_name_.c_str());
+ return nullptr;
+ }
+ }
+
+ Compaction* c = PickCompactionToOldest(start_index,
+ CompactionReason::kPeriodicCompaction);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+ return c;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h
new file mode 100644
index 000000000..c3f55f5d3
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+ UniversalCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc
new file mode 100644
index 000000000..49f287a97
--- /dev/null
+++ b/src/rocksdb/db/comparator_db_test.cc
@@ -0,0 +1,660 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <array>
+#include <map>
+#include <string>
+
+#include "memtable/stl_wrappers.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/kv_map.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+static const Comparator* kTestComparator = nullptr;
+
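+// An iterator over an in-memory KVMap, used as the reference ("expected")
+// iterator that the DB iterator is compared against.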
+class KVIter : public Iterator {
+ public:
+ explicit KVIter(const stl_wrappers::KVMap* map)
+ : map_(map), iter_(map_->end()) {}
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const stl_wrappers::KVMap* const map_;
+ stl_wrappers::KVMap::const_iterator iter_;
+};
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+ ASSERT_EQ(iter1->Valid(), iter2->Valid());
+ if (iter1->Valid()) {
+ ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+ ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+ }
+}
+
+// Runs random puts/deletes against the DB (expected to be empty initially)
+// and a reference std::map, then performs random iterator operations and
+// verifies both return the same keys and values.
+// source_strings are the candidate keys.
+void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
+ Random* rnd, int num_writes, int num_iter_ops,
+ int num_trigger_flush) {
+ stl_wrappers::KVMap map((stl_wrappers::LessOfComparator(kTestComparator)));
+
+ for (int i = 0; i < num_writes; i++) {
+ if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+ db->Flush(FlushOptions());
+ }
+
+ int type = rnd->Uniform(2);
+ int index = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto& key = source_strings[index];
+ switch (type) {
+ case 0:
+ // put
+ map[key] = key;
+ ASSERT_OK(db->Put(WriteOptions(), key, key));
+ break;
+ case 1:
+ // delete
+ if (map.find(key) != map.end()) {
+ map.erase(key);
+ }
+ ASSERT_OK(db->Delete(WriteOptions(), key));
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> result_iter(new KVIter(&map));
+
+ bool is_valid = false;
+ for (int i = 0; i < num_iter_ops; i++) {
+ // Random walk and make sure iter and result_iter return the
+ // same key and value
+ int type = rnd->Uniform(6);
+ ASSERT_OK(iter->status());
+ switch (type) {
+ case 0:
+ // Seek to First
+ iter->SeekToFirst();
+ result_iter->SeekToFirst();
+ break;
+ case 1:
+ // Seek to last
+ iter->SeekToLast();
+ result_iter->SeekToLast();
+ break;
+ case 2: {
+ // Seek to random key
+ auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ iter->Seek(key);
+ result_iter->Seek(key);
+ break;
+ }
+ case 3:
+ // Next
+ if (is_valid) {
+ iter->Next();
+ result_iter->Next();
+ } else {
+ continue;
+ }
+ break;
+ case 4:
+ // Prev
+ if (is_valid) {
+ iter->Prev();
+ result_iter->Prev();
+ } else {
+ continue;
+ }
+ break;
+ default: {
+ assert(type == 5);
+ auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ std::string result;
+ auto status = db->Get(ReadOptions(), key, &result);
+ if (map.find(key) == map.end()) {
+ ASSERT_TRUE(status.IsNotFound());
+ } else {
+ ASSERT_EQ(map[key], result);
+ }
+ break;
+ }
+ }
+ AssertItersEqual(iter.get(), result_iter.get());
+ is_valid = iter->Valid();
+ }
+}
+
+class DoubleComparator : public Comparator {
+ public:
+ DoubleComparator() {}
+
+ const char* Name() const override { return "DoubleComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+#ifndef CYGWIN
+ double da = std::stod(a.ToString());
+ double db = std::stod(b.ToString());
+#else
+ double da = std::strtod(a.ToString().c_str(), 0 /* endptr */);
+ double db = std::strtod(b.ToString().c_str(), 0 /* endptr */);
+#endif
+ if (da == db) {
+ return a.compare(b);
+ } else if (da > db) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class HashComparator : public Comparator {
+ public:
+ HashComparator() {}
+
+ const char* Name() const override { return "HashComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ uint32_t ha = Hash(a.data(), a.size(), 66);
+ uint32_t hb = Hash(b.data(), b.size(), 66);
+ if (ha == hb) {
+ return a.compare(b);
+ } else if (ha > hb) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
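+// Keys are encoded as [len1][len2][part1][part2]: two one-byte lengths
+// followed by the two string parts. Ordering compares part1 first, then
+// part2, so keys sharing part1 sort by part2.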
+class TwoStrComparator : public Comparator {
+ public:
+ TwoStrComparator() {}
+
+ const char* Name() const override { return "TwoStrComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ assert(a.size() >= 2);
+ assert(b.size() >= 2);
+ size_t size_a1 = static_cast<size_t>(a[0]);
+ size_t size_b1 = static_cast<size_t>(b[0]);
+ size_t size_a2 = static_cast<size_t>(a[1]);
+ size_t size_b2 = static_cast<size_t>(b[1]);
+ assert(size_a1 + size_a2 + 2 == a.size());
+ assert(size_b1 + size_b2 + 2 == b.size());
+
+ Slice a1 = Slice(a.data() + 2, size_a1);
+ Slice b1 = Slice(b.data() + 2, size_b1);
+ Slice a2 = Slice(a.data() + 2 + size_a1, size_a2);
+ Slice b2 = Slice(b.data() + 2 + size_b1, size_b2);
+
+ if (a1 != b1) {
+ return a1.compare(b1);
+ }
+ return a2.compare(b2);
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+} // namespace
+
+class ComparatorDBTest
+ : public testing::Test,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+ Options last_options_;
+ std::unique_ptr<const Comparator> comparator_guard;
+
+ public:
+ ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+ kTestComparator = BytewiseComparator();
+ dbname_ = test::PerThreadDBPath("comparator_db_test");
+ BlockBasedTableOptions toptions;
+ toptions.format_version = GetParam();
+ last_options_.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(toptions));
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ ~ComparatorDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ kTestComparator = BytewiseComparator();
+ }
+
+ DB* GetDB() { return db_; }
+
+ void SetOwnedComparator(const Comparator* cmp, bool owner = true) {
+ if (owner) {
+ comparator_guard.reset(cmp);
+ } else {
+ comparator_guard.reset();
+ }
+ kTestComparator = cmp;
+ last_options_.comparator = cmp;
+ }
+
+ // Return the current option configuration.
+ Options* GetOptions() { return &last_options_; }
+
+ void DestroyAndReopen() {
+ // Destroy using last options
+ Destroy();
+ ASSERT_OK(TryReopen());
+ }
+
+ void Destroy() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ Status TryReopen() {
+ delete db_;
+ db_ = nullptr;
+ last_options_.create_if_missing = true;
+
+ return DB::Open(last_options_, dbname_, &db_);
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest,
+ testing::Values(test::kLatestFormatVersion));
+
+TEST_P(ComparatorDBTest, Bytewise) {
+ for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
+ DestroyAndReopen();
+ Random rnd(rand_seed);
+ DoRandomIteraratorTest(GetDB(),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd,
+ 8, 100, 3);
+ }
+}
+
+TEST_P(ComparatorDBTest, SimpleSuffixReverseComparator) {
+ SetOwnedComparator(new test::SimpleSuffixReverseComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ std::vector<std::string> source_prefixes;
+ // Randomly generate 5 prefixes
+ for (int i = 0; i < 5; i++) {
+ source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8));
+ }
+ for (int j = 0; j < 20; j++) {
+ int prefix_index = rnd.Uniform(static_cast<int>(source_prefixes.size()));
+ std::string key = source_prefixes[prefix_index] +
+ test::RandomHumanReadableString(&rnd, rnd.Uniform(8));
+ source_strings.push_back(key);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, Uint64Comparator) {
+ SetOwnedComparator(test::Uint64Comparator(), false /* owner */);
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+ Random64 rnd64(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ uint64_t r = rnd64.Next();
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&r), 8);
+ source_strings.push_back(str);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, DoubleComparator) {
+ SetOwnedComparator(new DoubleComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ uint32_t r = rnd.Next();
+ uint32_t divide_order = rnd.Uniform(8);
+ double to_divide = 1.0;
+ for (uint32_t j = 0; j < divide_order; j++) {
+ to_divide *= 10.0;
+ }
+ source_strings.push_back(ToString(r / to_divide));
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, HashComparator) {
+ SetOwnedComparator(new HashComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ source_strings.push_back(test::RandomKey(&rnd, 8));
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, TwoStrComparator) {
+ SetOwnedComparator(new TwoStrComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ std::string str;
+ uint32_t size1 = rnd.Uniform(8);
+ uint32_t size2 = rnd.Uniform(8);
+ str.append(1, static_cast<char>(size1));
+ str.append(1, static_cast<char>(size2));
+ str.append(test::RandomKey(&rnd, size1));
+ str.append(test::RandomKey(&rnd, size2));
+ source_strings.push_back(str);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
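+// As the cases below exercise, IsSameLengthImmediateSuccessor(s, t) is
+// expected to hold exactly when t has the same length as s and is the
+// immediate bytewise successor of s, i.e. s with its last byte incremented,
+// carrying through trailing 0xff bytes.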
+TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) {
+ {
+ // different length
+ Slice s("abcxy");
+ Slice t("abcxyz");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ Slice s("abcxyz");
+ Slice t("abcxy");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ // not last byte different
+ Slice s("abc1xyz");
+ Slice t("abc2xyz");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ // same string
+ Slice s("abcxyz");
+ Slice t("abcxyz");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ Slice s("abcxy");
+ Slice t("abcxz");
+ ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ Slice s("abcxz");
+ Slice t("abcxy");
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ const char s_array[] = "\x50\x8a\xac";
+ const char t_array[] = "\x50\x8a\xad";
+ Slice s(s_array);
+ Slice t(t_array);
+ ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff";
+ const char t_array[] = "\x50\x8b\x00";
+ Slice s(s_array, 3);
+ Slice t(t_array, 3);
+ ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff\xff";
+ const char t_array[] = "\x50\x8b\x00\x00";
+ Slice s(s_array, 4);
+ Slice t(t_array, 4);
+ ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff\xff";
+ const char t_array[] = "\x50\x8b\x00\x01";
+ Slice s(s_array, 4);
+ Slice t(t_array, 4);
+ ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+ }
+}
+
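+// FindShortestSeparator(&start, limit) shortens *start to a key k with
+// *start <= k < limit under the comparator's ordering (the bytewise
+// inequalities flip for ReverseBytewiseComparator); it may also leave
+// *start unchanged.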
+TEST_P(ComparatorDBTest, FindShortestSeparator) {
+ std::string s1 = "abc1xyz";
+ std::string s2 = "abc3xy";
+
+ BytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc2", s1);
+
+ s1 = "abc5xyztt";
+
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc5", s1);
+
+ s1 = "abc3";
+ s2 = "abc2xy";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ s1 = "abc3xyz";
+ s2 = "abc2xy";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ s1 = "abc3xyz";
+ s2 = "abc2";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ std::string old_s1 = s1 = "abc2xy";
+ s2 = "abc2";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_TRUE(old_s1 >= s1);
+ ASSERT_TRUE(s1 > s2);
+}
+
+TEST_P(ComparatorDBTest, SeparatorSuccessorRandomizeTest) {
+ // Char list for boundary cases.
+ std::array<unsigned char, 6> char_list{{0, 1, 2, 253, 254, 255}};
+ Random rnd(301);
+
+ for (int attempts = 0; attempts < 1000; attempts++) {
+ uint32_t size1 = rnd.Skewed(4);
+ uint32_t size2;
+
+ if (rnd.OneIn(2)) {
+ // Pick size2 at random
+ size2 = rnd.Skewed(4);
+ } else {
+ // size2 is within [-2, +2] of size1
+ int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+ int tmp_size2 = static_cast<int>(size1) + diff;
+ if (tmp_size2 < 0) {
+ tmp_size2 = 0;
+ }
+ size2 = static_cast<uint32_t>(tmp_size2);
+ }
+
+ std::string s1;
+ std::string s2;
+ for (uint32_t i = 0; i < size1; i++) {
+ if (rnd.OneIn(2)) {
+ // Use random byte
+ s1 += static_cast<char>(rnd.Uniform(256));
+ } else {
+ // Use one byte in char_list
+ char c = static_cast<char>(char_list[rnd.Uniform(sizeof(char_list))]);
+ s1 += c;
+ }
+ }
+
+ // First set s2 to be the same as s1, and then modify s2.
+ s2 = s1;
+ s2.resize(size2);
+ // We start from the back of the string
+ if (size2 > 0) {
+ uint32_t pos = size2 - 1;
+ do {
+ if (pos >= size1 || rnd.OneIn(4)) {
+ // With 1/4 chance, use a random byte
+ s2[pos] = static_cast<char>(rnd.Uniform(256));
+ } else if (rnd.OneIn(4)) {
+ // With 1/4 chance, stop here.
+ break;
+ } else {
+ // Create a char within [-2, +2] of the matching char of s1.
+ int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+ // char may be signed or unsigned based on platform.
+ int s1_char = static_cast<int>(static_cast<unsigned char>(s1[pos]));
+ int s2_char = s1_char + diff;
+ if (s2_char < 0) {
+ s2_char = 0;
+ }
+ if (s2_char > 255) {
+ s2_char = 255;
+ }
+ s2[pos] = static_cast<char>(s2_char);
+ }
+ } while (pos-- != 0);
+ }
+
+ // Test separators
+ for (int rev = 0; rev < 2; rev++) {
+ if (rev == 1) {
+ // switch s1 and s2
+ std::string t = s1;
+ s1 = s2;
+ s2 = t;
+ }
+ std::string separator = s1;
+ BytewiseComparator()->FindShortestSeparator(&separator, s2);
+ std::string rev_separator = s1;
+ ReverseBytewiseComparator()->FindShortestSeparator(&rev_separator, s2);
+
+ if (s1 == s2) {
+ ASSERT_EQ(s1, separator);
+ ASSERT_EQ(s2, rev_separator);
+ } else if (s1 < s2) {
+ ASSERT_TRUE(s1 <= separator);
+ ASSERT_TRUE(s2 > separator);
+ ASSERT_LE(separator.size(), std::max(s1.size(), s2.size()));
+ ASSERT_EQ(s1, rev_separator);
+ } else {
+ ASSERT_TRUE(s1 >= rev_separator);
+ ASSERT_TRUE(s2 < rev_separator);
+ ASSERT_LE(rev_separator.size(), std::max(s1.size(), s2.size()));
+ ASSERT_EQ(s1, separator);
+ }
+ }
+
+ // Test successors
+ std::string succ = s1;
+ BytewiseComparator()->FindShortSuccessor(&succ);
+ ASSERT_TRUE(succ >= s1);
+
+ succ = s1;
+ ReverseBytewiseComparator()->FindShortSuccessor(&succ);
+ ASSERT_TRUE(succ <= s1);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/convenience.cc b/src/rocksdb/db/convenience.cc
new file mode 100644
index 000000000..206f1f875
--- /dev/null
+++ b/src/rocksdb/db/convenience.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/convenience.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+ (static_cast_with_check<DBImpl, DB>(db->GetRootDB()))
+ ->CancelAllBackgroundWork(wait);
+}
+
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ bool include_end) {
+ RangePtr range(begin, end);
+ return DeleteFilesInRanges(db, column_family, &range, 1, include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end) {
+ return (static_cast_with_check<DBImpl, DB>(db->GetRootDB()))
+ ->DeleteFilesInRanges(column_family, ranges, n, include_end);
+}
+
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const std::string& file_path) {
+ return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path);
+}
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const ReadOptions& read_options,
+ const std::string& file_path) {
+ std::unique_ptr<FSRandomAccessFile> file;
+ uint64_t file_size;
+ InternalKeyComparator internal_comparator(options.comparator);
+ ImmutableCFOptions ioptions(options);
+
+ Status s = ioptions.fs->NewRandomAccessFile(file_path,
+ FileOptions(env_options),
+ &file, nullptr);
+ if (s.ok()) {
+ s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+ } else {
+ return s;
+ }
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(std::move(file), file_path));
+ const bool kImmortal = true;
+ s = ioptions.table_factory->NewTableReader(
+ TableReaderOptions(ioptions, options.prefix_extractor.get(), env_options,
+ internal_comparator, false /* skip_filters */,
+ !kImmortal, -1 /* level */),
+ std::move(file_reader), file_size, &table_reader,
+ false /* prefetch_index_and_filter_in_cache */);
+ if (!s.ok()) {
+ return s;
+ }
+ s = table_reader->VerifyChecksum(read_options,
+ TableReaderCaller::kUserVerifyChecksum);
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc
new file mode 100644
index 000000000..203c34fa4
--- /dev/null
+++ b/src/rocksdb/db/corruption_test.cc
@@ -0,0 +1,613 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <cinttypes>
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest : public testing::Test {
+ public:
+ test::ErrorEnv env_;
+ std::string dbname_;
+ std::shared_ptr<Cache> tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ CorruptionTest() {
+ // If the LRU cache shard bit is smaller than 2 (or -1, which automatically
+ // sets it to 0), test SequenceNumberRecovery will fail, likely because of a
+ // bug in recovery code. Keep it at 4 for now so the test passes.
+ tiny_cache_ = NewLRUCache(100, 4);
+ options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options_.env = &env_;
+ dbname_ = test::PerThreadDBPath("corruption_test");
+ DestroyDB(dbname_, options_);
+
+ db_ = nullptr;
+ options_.create_if_missing = true;
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = 0; // make unit test pass for now
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen();
+ options_.create_if_missing = false;
+ }
+
+ ~CorruptionTest() override {
+ delete db_;
+ DestroyDB(dbname_, Options());
+ }
+
+ void CloseDb() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status TryReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opt = (options ? *options : options_);
+ if (opt.env == Options().env) {
+ // If env is not overridden, replace it with ErrorEnv.
+ // Otherwise, the test already uses a non-default Env.
+ opt.env = &env_;
+ }
+ opt.arena_block_size = 4096;
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = tiny_cache_;
+ table_options.block_size_deviation = 0;
+ opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ return DB::Open(opt, dbname_, &db_);
+ }
+
+ void Reopen(Options* options = nullptr) {
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void RepairDB() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_));
+ }
+
+ void Build(int n, int flush_every = 0) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = 0; i < n; i++) {
+ if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ }
+ //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ batch.Put(key, Value(i, &value_space));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+
+ void Check(int min_expected, int max_expected) {
+ uint64_t next_expected = 0;
+ uint64_t missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ // Do not verify checksums. If we verify checksums then the
+ // db itself will raise errors because data is corrupted.
+ // Instead, we want the reads to be successful and this test
+ // will detect whether the appropriate corruptions have
+ // occurred.
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ if (!ConsumeDecimalNumber(&in, &key) ||
+ !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+
+ fprintf(stderr,
+ "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
+ min_expected, max_expected, correct, bad_keys, bad_values,
+ static_cast<unsigned long long>(missed));
+ ASSERT_LE(min_expected, correct);
+ ASSERT_GE(max_expected, correct);
+ }
+
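+ // Flips the top bit of bytes_to_corrupt bytes of fname starting at offset.
+ // A negative offset is interpreted relative to the end of the file, e.g.
+ // CorruptFile(f, -2000, 500) corrupts 500 bytes starting 2000 bytes before
+ // the end of f.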
+ void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) {
+ struct stat sbuf;
+ if (stat(fname.c_str(), &sbuf) != 0) {
+ const char* msg = strerror(errno);
+ FAIL() << fname << ": " << msg;
+ }
+
+ if (offset < 0) {
+ // Relative to end of file; make it absolute
+ if (-offset > sbuf.st_size) {
+ offset = 0;
+ } else {
+ offset = static_cast<int>(sbuf.st_size + offset);
+ }
+ }
+ if (offset > sbuf.st_size) {
+ offset = static_cast<int>(sbuf.st_size);
+ }
+ if (offset + bytes_to_corrupt > sbuf.st_size) {
+ bytes_to_corrupt = static_cast<int>(sbuf.st_size - offset);
+ }
+
+ // Do it
+ std::string contents;
+ Status s = ReadFileToString(Env::Default(), fname, &contents);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ for (int i = 0; i < bytes_to_corrupt; i++) {
+ contents[i + offset] ^= 0x80;
+ }
+ s = WriteStringToFile(Env::Default(), contents, fname);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ Options options;
+ EnvOptions env_options;
+ options.file_system.reset(new LegacyFileSystemWrapper(options.env));
+ ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname));
+ }
+
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ std::string fname;
+ int picked_number = -1;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) &&
+ type == filetype &&
+ static_cast<int>(number) > picked_number) { // Pick latest file
+ fname = dbname_ + "/" + filenames[i];
+ picked_number = static_cast<int>(number);
+ }
+ }
+ ASSERT_TRUE(!fname.empty()) << filetype;
+
+ CorruptFile(fname, offset, bytes_to_corrupt);
+ }
+
+ // Corrupts exactly one file at level `level`. Asserts if no file is found
+ // at that level.
+ void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ for (const auto& m : metadata) {
+ if (m.level == level) {
+ CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
+ return;
+ }
+ }
+ FAIL() << "no file found at level";
+ }
+
+
+ int Property(const std::string& name) {
+ std::string property;
+ int result;
+ if (db_->GetProperty(name, &property) &&
+ sscanf(property.c_str(), "%d", &result) == 1) {
+ return result;
+ } else {
+ return -1;
+ }
+ }
+
+ // Return the ith key
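+ // e.g. Key(7, &storage) yields the 16-digit zero-padded string
+ // "0000000000000007".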
+ Slice Key(int i, std::string* storage) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", i);
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ if (k == 0) {
+ // Ugh. Random seed of 0 used to produce no entropy. This code
+ // preserves the implementation that was in place when all of the
+ // magic values in this file were picked.
+ *storage = std::string(kValueSize, ' ');
+ return Slice(*storage);
+ } else {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+ }
+};
+
+TEST_F(CorruptionTest, Recovery) {
+ Build(100);
+ Check(100, 100);
+#ifdef OS_WIN
+ // On Windows the OS disk cache does not behave properly:
+ // we do not call FlushBuffers on every Flush. If we do not close
+ // the log file prior to the corruption, we end up with only the second
+ // block corrupted, not the first. Under the debugger things work just
+ // fine, but they never pass when running normally.
+ // For that reason people may want to run with unbuffered I/O. That option
+ // is not available for the WAL though.
+ CloseDb();
+#endif
+ Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
+ Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
+ ASSERT_TRUE(!TryReopen().ok());
+ options_.paranoid_checks = false;
+ Reopen(&options_);
+
+ // The 64 records in the first two log blocks are completely lost.
+ Check(36, 36);
+}
+
+TEST_F(CorruptionTest, RecoverWriteError) {
+ env_.writable_file_error_ = true;
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+}
+
+TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
+ // Do enough writing to force minor compaction
+ env_.writable_file_error_ = true;
+ const int num =
+ static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
+ std::string value_storage;
+ Status s;
+ bool failed = false;
+ for (int i = 0; i < num; i++) {
+ WriteBatch batch;
+ batch.Put("a", Value(100, &value_storage));
+ s = db_->Write(WriteOptions(), &batch);
+ if (!s.ok()) {
+ failed = true;
+ }
+ ASSERT_TRUE(!failed || !s.ok());
+ }
+ ASSERT_TRUE(!s.ok());
+ ASSERT_GE(env_.num_writable_file_errors_, 1);
+ env_.writable_file_error_ = false;
+ Reopen();
+}
+
+TEST_F(CorruptionTest, TableFile) {
+ Build(100);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+ dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+ Corrupt(kTableFile, 100, 1);
+ Check(99, 99);
+ ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, VerifyChecksumReadahead) {
+ Options options;
+ SpecialEnv senv(Env::Default());
+ options.env = &senv;
+ // Disable block cache as we are going to check checksum for
+ // the same file twice and measure number of reads.
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc));
+
+ Reopen(&options);
+
+ Build(10000);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+ dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+ senv.count_random_reads_ = true;
+ senv.random_read_counter_.Reset();
+ ASSERT_OK(dbi->VerifyChecksum());
+
+ // Make sure the counter is enabled.
+ ASSERT_GT(senv.random_read_counter_.Read(), 0);
+
+ // The SST file is about 10MB. Default readahead size is 256KB.
+ // Allowing a conservative 20 reads for metadata blocks, the number
+ // of random reads should be within 10MB / 256KB + 20 = 60.
+ ASSERT_LT(senv.random_read_counter_.Read(), 60);
+
+ senv.random_read_bytes_counter_ = 0;
+ ReadOptions ro;
+ ro.readahead_size = size_t{32 * 1024};
+ ASSERT_OK(dbi->VerifyChecksum(ro));
+ // The SST file is about 10MB. We set readahead size to 32KB.
+ // Allow 0 to 20 reads for metadata blocks, and allow actual reads
+ // to range from 24KB to 48KB. The lower bound would be:
+ // 10MB / 48KB + 0 = 213
+ // The higher bound is
+ // 10MB / 24KB + 20 = 447.
+ ASSERT_GE(senv.random_read_counter_.Read(), 213);
+ ASSERT_LE(senv.random_read_counter_.Read(), 447);
+
+ // Test readahead shouldn't break mmap mode (where it should be
+ // disabled).
+ options.allow_mmap_reads = true;
+ Reopen(&options);
+ dbi = static_cast<DBImpl*>(db_);
+ ASSERT_OK(dbi->VerifyChecksum(ro));
+
+ CloseDb();
+}
+
+TEST_F(CorruptionTest, TableFileIndexData) {
+ Options options;
+ // very big, we'll trigger flushes manually
+ options.write_buffer_size = 100 * 1024 * 1024;
+ Reopen(&options);
+ // build 2 tables, flush at 5000
+ Build(10000, 5000);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+
+ // corrupt an index block of an entire file
+ Corrupt(kTableFile, -2000, 500);
+ options.paranoid_checks = false;
+ Reopen(&options);
+ dbi = reinterpret_cast<DBImpl*>(db_);
+ // One full file may be readable, since only one was corrupted;
+ // the other file should be fully unreadable, since its index was corrupted.
+ Check(0, 5000);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // In paranoid mode, the db cannot be opened due to the corrupted file.
+ ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, MissingDescriptor) {
+ Build(1000);
+ RepairDB();
+ Reopen();
+ Check(1000, 1000);
+}
+
+TEST_F(CorruptionTest, SequenceNumberRecovery) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v5", v);
+ // Write something. If sequence number was not recovered properly,
+ // it will be hidden by an earlier write.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+ Reopen();
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+}
+
+TEST_F(CorruptionTest, CorruptedDescriptor) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+
+ Corrupt(kDescriptorFile, 0, 1000);
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("hello", v);
+}
+
+TEST_F(CorruptionTest, CompactionInputError) {
+ Options options;
+ Reopen(&options);
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+ dbi->TEST_CompactRange(1, nullptr, nullptr);
+ ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(9, 9);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // Force compactions by writing lots of values
+ Build(10000);
+ Check(10000, 10000);
+ ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
+ Options options;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 131072;
+ options.max_write_buffer_number = 2;
+ Reopen(&options);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+ // Fill levels >= 1
+ for (int level = 1; level < dbi->NumberLevels(); level++) {
+ dbi->Put(WriteOptions(), "", "begin");
+ dbi->Put(WriteOptions(), "~", "end");
+ dbi->TEST_FlushMemTable();
+ for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
+ ++comp_level) {
+ dbi->TEST_CompactRange(comp_level, nullptr, nullptr);
+ }
+ }
+
+ Reopen(&options);
+
+ dbi = reinterpret_cast<DBImpl*>(db_);
+ Build(10);
+ dbi->TEST_FlushMemTable();
+ dbi->TEST_WaitForCompact();
+ ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+ CorruptTableFileAtLevel(0, 100, 1);
+ Check(9, 9);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // Write must eventually fail because of corrupted table
+ Status s;
+ std::string tmp1, tmp2;
+ bool failed = false;
+ for (int i = 0; i < 10000; i++) {
+ s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+ if (!s.ok()) {
+ failed = true;
+ }
+ // if one write failed, every subsequent write must fail, too
+ ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+ }
+ ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST_F(CorruptionTest, UnrelatedKeys) {
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_FlushMemTable();
+ Corrupt(kTableFile, 100, 1);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ std::string tmp1, tmp2;
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+ dbi->TEST_FlushMemTable();
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+TEST_F(CorruptionTest, RangeDeletionCorrupted) {
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(static_cast<size_t>(1), metadata.size());
+ std::string filename = dbname_ + metadata[0].name;
+
+ std::unique_ptr<RandomAccessFile> file;
+ ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions()));
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file),
+ filename));
+
+ uint64_t file_size;
+ ASSERT_OK(options_.env->GetFileSize(filename, &file_size));
+
+ BlockHandle range_del_handle;
+ ASSERT_OK(FindMetaBlock(
+ file_reader.get(), file_size, kBlockBasedTableMagicNumber,
+ ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle));
+
+ ASSERT_OK(TryReopen());
+ CorruptFile(filename, static_cast<int>(range_del_handle.offset()), 1);
+ ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, FileSystemStateCorrupted) {
+ for (int iter = 0; iter < 2; ++iter) {
+ Options options;
+ options.paranoid_checks = true;
+ options.create_if_missing = true;
+ Reopen(&options);
+ Build(10);
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ std::vector<LiveFileMetaData> metadata;
+ dbi->GetLiveFilesMetaData(&metadata);
+ ASSERT_GT(metadata.size(), size_t(0));
+ std::string filename = dbname_ + metadata[0].name;
+
+ delete db_;
+ db_ = nullptr;
+
+ if (iter == 0) { // corrupt file size
+ std::unique_ptr<WritableFile> file;
+ env_.NewWritableFile(filename, &file, EnvOptions());
+ file->Append(Slice("corrupted sst"));
+ file.reset();
+ Status x = TryReopen(&options);
+ ASSERT_TRUE(x.IsCorruption());
+ } else { // delete the file
+ env_.DeleteFile(filename);
+ Status x = TryReopen(&options);
+ ASSERT_TRUE(x.IsPathNotFound());
+ }
+
+ DestroyDB(dbname_, options_);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc
new file mode 100644
index 000000000..9467840ff
--- /dev/null
+++ b/src/rocksdb/db/cuckoo_table_db_test.cc
@@ -0,0 +1,351 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableDBTest : public testing::Test {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ public:
+ CuckooTableDBTest() : env_(Env::Default()) {
+ dbname_ = test::PerThreadDBPath("cuckoo_table_db_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ db_ = nullptr;
+ Reopen();
+ }
+
+ ~CuckooTableDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ Options CurrentOptions() {
+ Options options;
+ options.table_factory.reset(NewCuckooTableFactory());
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+ options.allow_mmap_reads = true;
+ options.create_if_missing = true;
+ options.allow_concurrent_memtable_write = false;
+ return options;
+ }
+
+ DBImpl* dbfull() {
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ // The following util methods are copied from plain_table_db_test.
+ void Reopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ ASSERT_OK(DB::Open(opts, dbname_, &db_));
+ }
+
+ Status Put(const Slice& k, const Slice& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+ }
+
+ std::string Get(const std::string& k) {
+ ReadOptions options;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
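+ // e.g. "1" means a single L0 file and nothing below; "0,2" means no L0
+ // files and two L1 files (trailing empty levels are trimmed).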
+ std::string FilesPerLevel() {
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+};
+
+TEST_F(CuckooTableDBTest, Flush) {
+ // Try with empty DB first.
+ ASSERT_TRUE(dbfull() != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("key2"));
+
+ // Add some values to db.
+ Options options = CurrentOptions();
+ Reopen(&options);
+
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key3", "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ TablePropertiesCollection ptc;
+ reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(1U, ptc.size());
+ ASSERT_EQ(3U, ptc.begin()->second->num_entries);
+ ASSERT_EQ("1", FilesPerLevel());
+
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+
+ // Now add more keys and flush.
+ ASSERT_OK(Put("key4", "v4"));
+ ASSERT_OK(Put("key5", "v5"));
+ ASSERT_OK(Put("key6", "v6"));
+ dbfull()->TEST_FlushMemTable();
+
+ reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(2U, ptc.size());
+ auto row = ptc.begin();
+ ASSERT_EQ(3U, row->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("v4", Get("key4"));
+ ASSERT_EQ("v5", Get("key5"));
+ ASSERT_EQ("v6", Get("key6"));
+
+ ASSERT_OK(Delete("key6"));
+ ASSERT_OK(Delete("key5"));
+ ASSERT_OK(Delete("key4"));
+ dbfull()->TEST_FlushMemTable();
+ reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(3U, ptc.size());
+ row = ptc.begin();
+ ASSERT_EQ(3U, row->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+ ASSERT_EQ("NOT_FOUND", Get("key5"));
+ ASSERT_EQ("NOT_FOUND", Get("key6"));
+}
+
+TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
+ Options options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key1", "v3")); // Duplicate
+ dbfull()->TEST_FlushMemTable();
+
+ TablePropertiesCollection ptc;
+ reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(1U, ptc.size());
+ ASSERT_EQ(2U, ptc.begin()->second->num_entries);
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_EQ("v3", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+}
+
+namespace {
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key_______%06d", i);
+ return std::string(buf);
+}
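+// Uint64Key(i) returns the raw 8-byte in-memory representation of i, the
+// fixed-width key format expected by the test::Uint64Comparator used below.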
+static std::string Uint64Key(uint64_t i) {
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&i), 8);
+ return str;
+}
+} // namespace.
+
+TEST_F(CuckooTableDBTest, Uint64Comparator) {
+ Options options = CurrentOptions();
+ options.comparator = test::Uint64Comparator();
+ Reopen(&options);
+
+ ASSERT_OK(Put(Uint64Key(1), "v1"));
+ ASSERT_OK(Put(Uint64Key(2), "v2"));
+ ASSERT_OK(Put(Uint64Key(3), "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v1", Get(Uint64Key(1)));
+ ASSERT_EQ("v2", Get(Uint64Key(2)));
+ ASSERT_EQ("v3", Get(Uint64Key(3)));
+ ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
+
+ // Add more keys.
+ ASSERT_OK(Delete(Uint64Key(2))); // Delete.
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_OK(Put(Uint64Key(3), "v0")); // Update.
+ ASSERT_OK(Put(Uint64Key(4), "v4"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v1", Get(Uint64Key(1)));
+ ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
+ ASSERT_EQ("v0", Get(Uint64Key(3)));
+ ASSERT_EQ("v4", Get(Uint64Key(4)));
+}
+
+TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+ // Create a big L0 file and check it compacts into multiple files in L1.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 270 << 10;
+ // Two SST files should be created, each containing 14 keys.
+ // Number of buckets will be 16. Total size ~156 KB.
+ options.target_file_size_base = 160 << 10;
+ Reopen(&options);
+
+ // Write 28 values, each 10016 B ~ 10KB
+ for (int idx = 0; idx < 28; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("1", FilesPerLevel());
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ ASSERT_EQ("0,2", FilesPerLevel());
+ for (int idx = 0; idx < 28; ++idx) {
+ ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+ }
+}
+
+TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
+ // Insert same key twice so that they go to different SST files. Then wait for
+ // compaction and check if the latest value is stored and old value removed.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(&options);
+
+ // Write 11 values, each 10016 B
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a')));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("1", FilesPerLevel());
+
+ // Generate one more file in level-0, and should trigger level-0 compaction
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+ ASSERT_EQ("0,1", FilesPerLevel());
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+ }
+}
+
+TEST_F(CuckooTableDBTest, AdaptiveTable) {
+ Options options = CurrentOptions();
+
+ // Ensure options compatible with PlainTable
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+
+ // Write some keys using cuckoo table.
+ options.table_factory.reset(NewCuckooTableFactory());
+ Reopen(&options);
+
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key3", "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ // Write some keys using plain table.
+ std::shared_ptr<TableFactory> block_based_factory(
+ NewBlockBasedTableFactory());
+ std::shared_ptr<TableFactory> plain_table_factory(
+ NewPlainTableFactory());
+ std::shared_ptr<TableFactory> cuckoo_table_factory(
+ NewCuckooTableFactory());
+ options.create_if_missing = false;
+ options.table_factory.reset(NewAdaptiveTableFactory(
+ plain_table_factory, block_based_factory, plain_table_factory,
+ cuckoo_table_factory));
+ Reopen(&options);
+ ASSERT_OK(Put("key4", "v4"));
+ ASSERT_OK(Put("key1", "v5"));
+ dbfull()->TEST_FlushMemTable();
+
+ // Write some keys using block based table.
+ options.table_factory.reset(NewAdaptiveTableFactory(
+ block_based_factory, block_based_factory, plain_table_factory,
+ cuckoo_table_factory));
+ Reopen(&options);
+ ASSERT_OK(Put("key5", "v6"));
+ ASSERT_OK(Put("key2", "v7"));
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v5", Get("key1"));
+ ASSERT_EQ("v7", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("v4", Get("key4"));
+ ASSERT_EQ("v6", Get("key5"));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+ } else {
+ fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+ return 0;
+ }
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_basic_test.cc b/src/rocksdb/db/db_basic_test.cc
new file mode 100644
index 000000000..7573a01b4
--- /dev/null
+++ b/src/rocksdb/db/db_basic_test.cc
@@ -0,0 +1,2545 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/fault_injection_test_env.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBasicTest : public DBTestBase {
+ public:
+ DBBasicTest() : DBTestBase("/db_basic_test") {}
+};
+
+TEST_F(DBBasicTest, OpenWhenOpen) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ ROCKSDB_NAMESPACE::DB* db2 = nullptr;
+ ROCKSDB_NAMESPACE::Status s = DB::Open(options, dbname_, &db2);
+
+ ASSERT_EQ(Status::Code::kIOError, s.code());
+ ASSERT_EQ(Status::SubCode::kNone, s.subcode());
+ ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
+
+ delete db2;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, ReadOnlyDB) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ Close();
+
+ auto options = CurrentOptions();
+ assert(options.env == env_);
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ Close();
+
+ // Reopen and flush memtable.
+ Reopen(options);
+ Flush();
+ Close();
+ // Now check keys in read only mode.
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+}
+
+TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ Close();
+
+ auto options = CurrentOptions();
+ options.write_dbid_to_manifest = true;
+ assert(options.env == env_);
+ ASSERT_OK(ReadOnlyReopen(options));
+ std::string db_id1;
+ db_->GetDbIdentity(db_id1);
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ Close();
+
+ // Reopen and flush memtable.
+ Reopen(options);
+ Flush();
+ Close();
+ // Now check keys in read only mode.
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+ std::string db_id2;
+ db_->GetDbIdentity(db_id2);
+ ASSERT_EQ(db_id1, db_id2);
+}
+
+TEST_F(DBBasicTest, CompactedDB) {
+ const uint64_t kFileSize = 1 << 20;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = kFileSize;
+ options.target_file_size_base = kFileSize;
+ options.max_bytes_for_level_base = 1 << 30;
+ options.compression = kNoCompression;
+ Reopen(options);
+ // 1 L0 file, use CompactedDB if max_open_files = -1
+ ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
+ Flush();
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Status s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+ ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported in compacted db mode.");
+ ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+ Close();
+ Reopen(options);
+ // Add more L0 files
+ ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
+ Flush();
+ ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
+ Flush();
+ ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
+ ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
+ Flush();
+ Close();
+
+ ASSERT_OK(ReadOnlyReopen(options));
+ // Fallback to read-only DB
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+ Close();
+
+ // Full compaction
+ Reopen(options);
+ // Add more keys
+ ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+ ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
+ ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
+ ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+ Close();
+
+ // CompactedDB
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported in compacted db mode.");
+ ASSERT_EQ("NOT_FOUND", Get("abc"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
+ ASSERT_EQ("NOT_FOUND", Get("ccc"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
+ ASSERT_EQ("NOT_FOUND", Get("ggg"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+ ASSERT_EQ("NOT_FOUND", Get("kkk"));
+
+ // MultiGet
+ std::vector<std::string> values;
+ std::vector<Status> status_list = dbfull()->MultiGet(
+ ReadOptions(),
+ std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+ Slice("ggg"), Slice("iii"), Slice("kkk")}),
+ &values);
+ ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+ ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+ ASSERT_OK(status_list[0]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+ ASSERT_TRUE(status_list[1].IsNotFound());
+ ASSERT_OK(status_list[2]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+ ASSERT_TRUE(status_list[3].IsNotFound());
+ ASSERT_OK(status_list[4]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+ ASSERT_TRUE(status_list[5].IsNotFound());
+
+ Reopen(options);
+ // Add a key
+ ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+}
+
+TEST_F(DBBasicTest, LevelLimitReopen) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const std::string value(1024 * 1024, ' ');
+ int i = 0;
+ while (NumTableFilesAtLevel(2, 1) == 0) {
+ ASSERT_OK(Put(1, Key(i++), value));
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ options.num_levels = 1;
+ options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(s.IsInvalidArgument(), true);
+ ASSERT_EQ(s.ToString(),
+ "Invalid argument: db has more levels than options.num_levels");
+
+ options.num_levels = 10;
+ options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, PutDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBBasicTest, PutSingleDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo2", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo2"));
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, EmptyFlush) {
+ // It is possible to produce empty flushes when using single deletes. Tests
+ // whether empty flushes cause issues.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "a", Slice());
+ SingleDelete(1, "a");
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+ // Skip FIFO and universal compaction as they do not apply to the test
+ // case. Skip MergePut because merges cannot be combined with single
+ // deletions.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, GetFromVersions) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ } while (ChangeOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, GetSnapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ // Try with both a short key and a long key
+ for (int i = 0; i < 2; i++) {
+ std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+ ASSERT_OK(Put(1, key, "v1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, key, "v2"));
+ ASSERT_EQ("v2", Get(1, key));
+ ASSERT_EQ("v1", Get(1, key, s1));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, key));
+ ASSERT_EQ("v1", Get(1, key, s1));
+ db_->ReleaseSnapshot(s1);
+ }
+ } while (ChangeOptions());
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, CheckLock) {
+ do {
+ DB* localdb;
+ Options options = CurrentOptions();
+ ASSERT_OK(TryReopen(options));
+
+ // second open should fail
+ ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok());
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushMultipleMemtable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_size_to_maintain = -1;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+ ASSERT_OK(Flush(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushEmptyColumnFamily) {
+ // Block flush thread and disable compaction thread
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ Options options = CurrentOptions();
+ // disable compaction
+ options.disable_auto_compactions = true;
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 2;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(options.write_buffer_size);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Flushing the empty column families can still go through even if no
+ // thread can flush the mem table.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+
+ // Insert can go through
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(0, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+
+ // Flush can still go through.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBBasicTest, FLUSH) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ SetPerfLevel(kEnableTime);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ // Flush the write above into an SST file so the Get below reads from
+ // table files.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ get_perf_context()->Reset();
+ Get(1, "foo");
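+ // A nonzero get_from_output_files_time shows the Get went to the SST file
+ // written by the flush above; get_read_bytes counts the two bytes of "v1".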
+ ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+ ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes);
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+ ASSERT_OK(Flush(1));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v2", Get(1, "bar"));
+ get_perf_context()->Reset();
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+ ASSERT_OK(Flush(1));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ // Both keys should be there: their latest puts had the WAL enabled and
+ // were also flushed before the reopen.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ManifestRollOver) {
+ do {
+ Options options;
+ options.max_manifest_file_size = 10; // 10 bytes
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ {
+ ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+ ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+ ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+ uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+ ASSERT_OK(Flush(1)); // This should trigger LogAndApply.
+ uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+ ASSERT_GT(manifest_after_flush, manifest_before_flush);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+ // The data written before the manifest roll-overs should still be readable.
+ ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+ ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+ ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts1) {
+ do {
+ std::string id1;
+ ASSERT_OK(db_->GetDbIdentity(id1));
+
+ Options options = CurrentOptions();
+ Reopen(options);
+ std::string id2;
+ ASSERT_OK(db_->GetDbIdentity(id2));
+ // id1 should match id2 because identity was not regenerated
+ ASSERT_EQ(id1.compare(id2), 0);
+
+ std::string idfilename = IdentityFileName(dbname_);
+ ASSERT_OK(env_->DeleteFile(idfilename));
+ Reopen(options);
+ std::string id3;
+ ASSERT_OK(db_->GetDbIdentity(id3));
+ if (options.write_dbid_to_manifest) {
+ ASSERT_EQ(id1.compare(id3), 0);
+ } else {
+ // id1 should NOT match id3 because identity was regenerated
+ ASSERT_NE(id1.compare(id3), 0);
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts2) {
+ do {
+ std::string id1;
+ ASSERT_OK(db_->GetDbIdentity(id1));
+
+ Options options = CurrentOptions();
+ options.write_dbid_to_manifest = true;
+ Reopen(options);
+ std::string id2;
+ ASSERT_OK(db_->GetDbIdentity(id2));
+ // id1 should match id2 because identity was not regenerated
+ ASSERT_EQ(id1.compare(id2), 0);
+
+ std::string idfilename = IdentityFileName(dbname_);
+ ASSERT_OK(env_->DeleteFile(idfilename));
+ Reopen(options);
+ std::string id3;
+ ASSERT_OK(db_->GetDbIdentity(id3));
+ // id1 should still match id3 because, with write_dbid_to_manifest set, the
+ // identity is recovered from the MANIFEST even after the IDENTITY file is
+ // deleted.
+ ASSERT_EQ(id1, id3);
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, Snapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ Put(0, "foo", "0v1");
+ Put(1, "foo", "1v1");
+
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_EQ(1U, GetNumSnapshots());
+ uint64_t time_snap1 = GetTimeOldestSnapshots();
+ ASSERT_GT(time_snap1, 0U);
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ Put(0, "foo", "0v2");
+ Put(1, "foo", "1v2");
+
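+ // Advance the mock clock so snapshots taken from here on get a later
+ // creation time than s1; the ASSERT_LT on GetTimeOldestSnapshots() after
+ // s1 is released depends on this.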
+ env_->addon_time_.fetch_add(1);
+
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ Put(0, "foo", "0v3");
+ Put(1, "foo", "1v3");
+
+ {
+ ManagedSnapshot s3(db_);
+ ASSERT_EQ(3U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+ Put(0, "foo", "0v4");
+ Put(1, "foo", "1v4");
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+ ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ }
+
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ ASSERT_EQ(1U, GetNumSnapshots());
+ ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ(0U, GetNumSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, CompactBetweenSnapshots) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
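+ // Cover the ["a", "z"] range with table files so the manual compactions
+ // below have overlapping files to rewrite.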
+ FillLevels("a", "z", 1);
+
+ Put(1, "foo", "first");
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ Put(1, "foo", "second");
+ Put(1, "foo", "third");
+ Put(1, "foo", "fourth");
+ const Snapshot* snapshot2 = db_->GetSnapshot();
+ Put(1, "foo", "fifth");
+ Put(1, "foo", "sixth");
+
+ // All entries (including duplicates) exist
+ // before any compaction or flush is triggered.
+ ASSERT_EQ(AllEntriesFor("foo", 1),
+ "[ sixth, fifth, fourth, third, second, first ]");
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+ ASSERT_EQ("first", Get(1, "foo", snapshot1));
+
+ // After a flush, "second", "third" and "fifth" should
+ // be removed
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+
+ // after we release the snapshot1, only two values left
+ db_->ReleaseSnapshot(snapshot1);
+ FillLevels("a", "z", 1);
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+
+ // Only snapshot2 is still valid. Since snapshot1 is not valid anymore,
+ // "first" should be removed by a compaction.
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+
+ // after we release the snapshot2, only one value should be left
+ db_->ReleaseSnapshot(snapshot2);
+ FillLevels("a", "z", 1);
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
+ } while (ChangeOptions(kSkipFIFOCompaction));
+}
+
+TEST_F(DBBasicTest, DBOpen_Options) {
+ Options options = CurrentOptions();
+ Close();
+ Destroy(options);
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ options.create_if_missing = false;
+ Status s = DB::Open(options, dbname_, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ options.create_if_missing = true;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ options.create_if_missing = false;
+ options.error_if_exists = true;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST_F(DBBasicTest, CompactOnFlush) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "foo", "v1");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
+
+ // Write two new keys
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ Flush(1);
+
+ // Case 1: Delete followed by a put
+ Delete(1, "foo");
+ Put(1, "foo", "v2");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+
+ // After the current memtable is flushed, the DEL should
+ // have been removed
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+
+ // Case 2: Delete followed by another delete
+ Delete(1, "foo");
+ Delete(1, "foo");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 3: Put followed by a delete
+ Put(1, "foo", "v3");
+ Delete(1, "foo");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 4: Put followed by another Put
+ Put(1, "foo", "v4");
+ Put(1, "foo", "v5");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+
+ // clear database
+ Delete(1, "foo");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 5: Put followed by snapshot followed by another Put
+ // Both puts should remain.
+ Put(1, "foo", "v6");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ Put(1, "foo", "v7");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+ db_->ReleaseSnapshot(snapshot);
+
+ // clear database
+ Delete(1, "foo");
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 6: snapshot followed by a put followed by another Put
+ // Only the last put should remain.
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ Put(1, "foo", "v8");
+ Put(1, "foo", "v9");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+ db_->ReleaseSnapshot(snapshot1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushOneColumnFamily) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ ASSERT_OK(Put(0, "Default", "Default"));
+ ASSERT_OK(Put(1, "pikachu", "pikachu"));
+ ASSERT_OK(Put(2, "ilya", "ilya"));
+ ASSERT_OK(Put(3, "muromec", "muromec"));
+ ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+ ASSERT_OK(Put(5, "nikitich", "nikitich"));
+ ASSERT_OK(Put(6, "alyosha", "alyosha"));
+ ASSERT_OK(Put(7, "popovich", "popovich"));
+
+ for (int i = 0; i < 8; ++i) {
+ Flush(i);
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), i + 1U);
+ }
+}
+
+TEST_F(DBBasicTest, MultiGetSimple) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+ std::vector<std::string> values(20, "Temporary data to be overwritten");
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+ get_perf_context()->Reset();
+ std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(values[0], "v1");
+ ASSERT_EQ(values[1], "v2");
+ ASSERT_EQ(values[2], "v3");
+ ASSERT_EQ(values[4], "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_OK(s[0]);
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_TRUE(s[3].IsNotFound());
+ ASSERT_OK(s[4]);
+ ASSERT_TRUE(s[5].IsNotFound());
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetEmpty) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Empty Key Set
+ std::vector<Slice> keys;
+ std::vector<std::string> values;
+ std::vector<ColumnFamilyHandle*> cfs;
+ std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(s.size(), 0U);
+
+ // Empty Database, Empty Key Set
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(s.size(), 0U);
+
+ // Empty Database, Search for Keys
+ keys.resize(2);
+ keys[0] = "a";
+ keys[1] = "b";
+ cfs.push_back(handles_[0]);
+ cfs.push_back(handles_[1]);
+ s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(static_cast<int>(s.size()), 2);
+ ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ChecksumTest) {
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ // change when new checksum type added
+ int max_checksum = static_cast<int>(kxxHash64);
+ const int kNumPerFile = 2;
+
+ // generate one table with each type of checksum
+ for (int i = 0; i <= max_checksum; ++i) {
+ table_options.checksum = static_cast<ChecksumType>(i);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // with each valid checksum type setting...
+ for (int i = 0; i <= max_checksum; ++i) {
+ table_options.checksum = static_cast<ChecksumType>(i);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ // verify every table file can be read back regardless of which checksum
+ // type is currently configured
+ for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) {
+ ASSERT_EQ(Key(j), Get(Key(j)));
+ }
+ }
+}
+
+// On Windows a file can be opened either memory mapped or with unbuffered
+// access, but not both, so this test would assert and does not make sense
+// to run there.
+#ifndef OS_WIN
+TEST_F(DBBasicTest, MmapAndBufferOptions) {
+ if (!IsMemoryMappedAccessSupported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+
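+ // Direct (unbuffered) reads and mmap reads cannot be combined, so opening
+ // with both enabled is expected to fail.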
+ options.use_direct_reads = true;
+ options.allow_mmap_reads = true;
+ ASSERT_NOK(TryReopen(options));
+
+ // All other combinations are acceptable
+ options.use_direct_reads = false;
+ ASSERT_OK(TryReopen(options));
+
+ if (IsDirectIOSupported()) {
+ options.use_direct_reads = true;
+ options.allow_mmap_reads = false;
+ ASSERT_OK(TryReopen(options));
+ }
+
+ options.use_direct_reads = false;
+ ASSERT_OK(TryReopen(options));
+}
+#endif
+
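+// Env wrapper whose loggers count Close() calls and always fail the close
+// with Status::IOError(), so the DBClose test below can check whether
+// DB::Close() closes the info log and how close errors propagate.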
+class TestEnv : public EnvWrapper {
+ public:
+ explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {}
+
+ class TestLogger : public Logger {
+ public:
+ using Logger::Logv;
+ explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+ ~TestLogger() override {
+ if (!closed_) {
+ CloseHelper();
+ }
+ }
+ void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+ protected:
+ Status CloseImpl() override { return CloseHelper(); }
+
+ private:
+ Status CloseHelper() {
+ env->CloseCountInc();
+ return Status::IOError();
+ }
+ TestEnv* env;
+ };
+
+ void CloseCountInc() { close_count++; }
+
+ int GetCloseCount() { return close_count; }
+
+ Status NewLogger(const std::string& /*fname*/,
+ std::shared_ptr<Logger>* result) override {
+ result->reset(new TestLogger(this));
+ return Status::OK();
+ }
+
+ private:
+ int close_count;
+};
+
+TEST_F(DBBasicTest, DBClose) {
+ Options options = GetDefaultOptions();
+ std::string dbname = test::PerThreadDBPath("db_close_test");
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ DB* db = nullptr;
+ TestEnv* env = new TestEnv(env_);
+ std::unique_ptr<TestEnv> local_env_guard(env);
+ options.create_if_missing = true;
+ options.env = env;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ s = db->Close();
+ ASSERT_EQ(env->GetCloseCount(), 1);
+ ASSERT_EQ(s, Status::IOError());
+
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 1);
+
+ // Do not call DB::Close() and ensure our logger Close() still gets called
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 2);
+
+ // Provide our own logger and ensure DB::Close() does not close it
+ options.info_log.reset(new TestEnv::TestLogger(env));
+ options.create_if_missing = false;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ s = db->Close();
+ ASSERT_EQ(s, Status::OK());
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 2);
+ options.info_log.reset();
+ ASSERT_EQ(env->GetCloseCount(), 3);
+}
+
+TEST_F(DBBasicTest, DBCloseFlushError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.manual_wal_flush = true;
+ options.write_buffer_size = 100;
+ options.env = fault_injection_env.get();
+
+ Reopen(options);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ ASSERT_OK(Put("key3", "value3"));
+ fault_injection_env->SetFilesystemActive(false);
+ Status s = dbfull()->Close();
+ fault_injection_env->SetFilesystemActive(true);
+ ASSERT_NE(s, Status::OK());
+
+ Destroy(options);
+}
+
+class DBMultiGetTestWithParam : public DBBasicTest,
+ public testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+ // <CF, key, value> tuples
+ std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
+ static const int num_keys = 24;
+ cf_kv_vec.reserve(num_keys);
+
+ for (int i = 0; i < num_keys; ++i) {
+ int cf = i / 3;
+ int cf_key = i % 3;
+ cf_kv_vec.emplace_back(std::make_tuple(
+ cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
+ "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
+ ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+ std::get<2>(cf_kv_vec[i])));
+ }
+
+ int get_sv_count = 0;
+ ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast<DBImpl*>(db_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (++get_sv_count == 2) {
+ // After MultiGet refs a couple of CFs, flush all CFs so MultiGet
+ // is forced to repeat the process
+ for (int i = 0; i < num_keys; ++i) {
+ int cf = i / 3;
+ int cf_key = i % 8;
+ if (cf_key == 0) {
+ ASSERT_OK(Flush(cf));
+ }
+ ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+ std::get<2>(cf_kv_vec[i]) + "_2"));
+ }
+ }
+ if (get_sv_count == 11) {
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ db->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < num_keys; ++i) {
+ cfs.push_back(std::get<0>(cf_kv_vec[i]));
+ keys.push_back(std::get<1>(cf_kv_vec[i]));
+ }
+
+ values = MultiGet(cfs, keys, nullptr, GetParam());
+ ASSERT_EQ(values.size(), num_keys);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
+ }
+
+ keys.clear();
+ cfs.clear();
+ cfs.push_back(std::get<0>(cf_kv_vec[0]));
+ keys.push_back(std::get<1>(cf_kv_vec[0]));
+ cfs.push_back(std::get<0>(cf_kv_vec[3]));
+ keys.push_back(std::get<1>(cf_kv_vec[3]));
+ cfs.push_back(std::get<0>(cf_kv_vec[4]));
+ keys.push_back(std::get<1>(cf_kv_vec[4]));
+ values = MultiGet(cfs, keys, nullptr, GetParam());
+ ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
+ ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
+ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
+
+ keys.clear();
+ cfs.clear();
+ cfs.push_back(std::get<0>(cf_kv_vec[7]));
+ keys.push_back(std::get<1>(cf_kv_vec[7]));
+ cfs.push_back(std::get<0>(cf_kv_vec[6]));
+ keys.push_back(std::get<1>(cf_kv_vec[6]));
+ cfs.push_back(std::get<0>(cf_kv_vec[1]));
+ keys.push_back(std::get<1>(cf_kv_vec[1]));
+ values = MultiGet(cfs, keys, nullptr, GetParam());
+ ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
+ ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
+ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
+
+ for (int cf = 0; cf < 8; ++cf) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(cf))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val"));
+ }
+
+ int get_sv_count = 0;
+ int retries = 0;
+ bool last_try = false;
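+ // Every time MultiGet has referenced super versions for two column
+ // families, flush and overwrite all CFs so the acquired super versions
+ // become stale. This forces MultiGet to keep retrying until it falls back
+ // to its last-try path (under the DB mutex), signalled by the
+ // "DBImpl::MultiGet::LastTry" sync point.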
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) {
+ last_try = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (last_try) {
+ return;
+ }
+ if (++get_sv_count == 2) {
+ ++retries;
+ get_sv_count = 0;
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(Put(
+ i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val" + std::to_string(retries)));
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < 8; ++i) {
+ cfs.push_back(i);
+ keys.push_back("cf" + std::to_string(i) + "_key");
+ }
+
+ values = MultiGet(cfs, keys, nullptr, GetParam());
+ ASSERT_TRUE(last_try);
+ ASSERT_EQ(values.size(), 8);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j],
+ "cf" + std::to_string(j) + "_val" + std::to_string(retries));
+ }
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val"));
+ }
+
+ int get_sv_count = 0;
+ ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast<DBImpl*>(db_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (++get_sv_count == 2) {
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val2"));
+ }
+ }
+ if (get_sv_count == 8) {
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ db->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_TRUE(
+ (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) ||
+ (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete));
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < 8; ++i) {
+ cfs.push_back(i);
+ keys.push_back("cf" + std::to_string(i) + "_key");
+ }
+
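+ // Reading at an explicit snapshot does not require a consistent set of
+ // super versions across column families, so MultiGet should not retry and
+ // the values read should be the original "_val" ones despite the flush and
+ // overwrite in the callback above.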
+ const Snapshot* snapshot = db_->GetSnapshot();
+ values = MultiGet(cfs, keys, snapshot, GetParam());
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(values.size(), 8);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
+ }
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+ reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
+ testing::Bool());
+
+TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k2", "k1"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+ values.data(), s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_TRUE(s[2].IsNotFound());
+ ASSERT_OK(s[3]);
+ ASSERT_OK(s[4]);
+ ASSERT_OK(s[5]);
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetBatchedSimpleSorted) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+ values.data(), s.data(), true);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_OK(s[0]);
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_TRUE(s[3].IsNotFound());
+ ASSERT_OK(s[4]);
+ ASSERT_TRUE(s[5].IsNotFound());
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 128; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 128; i += 9) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
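+ // Resulting layout: keys divisible by 9 have their newest value in the
+ // memtable, by 5 in L0, by 3 in L1, and all other keys only in L2. The
+ // checks below verify MultiGet returns the value from the newest level.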
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 64; i < 80; ++i) {
+ keys.push_back("key_" + std::to_string(i));
+ }
+
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values.size(), 16);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ int key = j + 64;
+ if (key % 9 == 0) {
+ ASSERT_EQ(values[j], "val_mem_" + std::to_string(key));
+ } else if (key % 5 == 0) {
+ ASSERT_EQ(values[j], "val_l0_" + std::to_string(key));
+ } else if (key % 3 == 0) {
+ ASSERT_EQ(values[j], "val_l1_" + std::to_string(key));
+ } else {
+ ASSERT_EQ(values[j], "val_l2_" + std::to_string(key));
+ }
+ }
+}
+
+TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 128; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ Flush();
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ Flush();
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 128; i += 9) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 32; i < 80; ++i) {
+ keys.push_back("key_" + std::to_string(i));
+ }
+
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values.size(), keys.size());
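+ // The string-append merge operator joins operands with ',', so each
+ // expected value is the L2 base value followed by whichever of the L1, L0
+ // and memtable operands apply to that key.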
+ for (unsigned int j = 0; j < 48; ++j) {
+ int key = j + 32;
+ std::string value;
+ value.append("val_l2_" + std::to_string(key));
+ if (key % 3 == 0) {
+ value.append(",");
+ value.append("val_l1_" + std::to_string(key));
+ }
+ if (key % 5 == 0) {
+ value.append(",");
+ value.append("val_l0_" + std::to_string(key));
+ }
+ if (key % 9 == 0) {
+ value.append(",");
+ value.append("val_mem_" + std::to_string(key));
+ }
+ ASSERT_EQ(values[j], value);
+ }
+}
+
+// Test class for batched MultiGet with prefix extractor
+// Param bool - If true, use partitioned filters
+// If false, use full filter block
+class MultiGetPrefixExtractorTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {
+};
+
+TEST_P(MultiGetPrefixExtractorTest, Batched) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ options.memtable_prefix_bloom_size_ratio = 10;
+ BlockBasedTableOptions bbto;
+ if (GetParam()) {
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.partition_filters = true;
+ }
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ bbto.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+
+ // First key is not in the prefix_extractor domain
+ ASSERT_OK(Put("k", "v0"));
+ ASSERT_OK(Put("kk1", "v1"));
+ ASSERT_OK(Put("kk2", "v2"));
+ ASSERT_OK(Put("kk3", "v3"));
+ ASSERT_OK(Put("kk4", "v4"));
+ std::vector<std::string> mem_keys(
+ {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"});
+ std::vector<std::string> inmem_values;
+ inmem_values = MultiGet(mem_keys, nullptr);
+ ASSERT_EQ(inmem_values[0], "v0");
+ ASSERT_EQ(inmem_values[1], "v1");
+ ASSERT_EQ(inmem_values[2], "v2");
+ ASSERT_EQ(inmem_values[3], "v3");
+ ASSERT_EQ(inmem_values[4], "v4");
+ ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2);
+ ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 5);
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> keys({"k", "kk1", "kk2", "kk3", "kk4"});
+ std::vector<std::string> values;
+ get_perf_context()->Reset();
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values[0], "v0");
+ ASSERT_EQ(values[1], "v1");
+ ASSERT_EQ(values[2], "v2");
+ ASSERT_EQ(values[3], "v3");
+ ASSERT_EQ(values[4], "v4");
+ // Filter hits for 4 in-domain keys
+ ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+}
+
+INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest,
+ ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+class DBMultiGetRowCacheTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) {
+ do {
+ option_config_ = kRowCache;
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ Flush(1);
+ ASSERT_OK(Put(1, "k5", "v5"));
+ const Snapshot* snap1 = dbfull()->GetSnapshot();
+ ASSERT_OK(Delete(1, "k4"));
+ Flush(1);
+ const Snapshot* snap2 = dbfull()->GetSnapshot();
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k1"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ bool use_snapshots = GetParam();
+ if (use_snapshots) {
+ ro.snapshot = snap2;
+ }
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+ // three found kv pairs * two bytes per value
+ ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_TRUE(s[2].IsNotFound());
+ ASSERT_OK(s[3]);
+ ASSERT_OK(s[4]);
+
+ // Call MultiGet() again with some intersection with the previous set of
+ // keys. Those should already be in the row cache.
+ keys.assign({"no_key", "k5", "k3", "k2"});
+ for (size_t i = 0; i < keys.size(); ++i) {
+ values[i].Reset();
+ s[i] = Status::OK();
+ }
+ get_perf_context()->Reset();
+
+ if (use_snapshots) {
+ ro.snapshot = snap1;
+ }
+ db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+ values.data(), s.data(), false);
+
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+ // three found kv pairs * two bytes per value
+ ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_OK(s[3]);
+ if (use_snapshots) {
+ // Only reads from the first SST file would have been cached, since
+ // snapshot seq no is > fd.largest_seqno
+ ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT));
+ } else {
+ ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT));
+ }
+
+ SetPerfLevel(kDisable);
+ dbfull()->ReleaseSnapshot(snap1);
+ dbfull()->ReleaseSnapshot(snap2);
+ } while (ChangeCompactOptions());
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest,
+ testing::Values(true, false));
+
+TEST_F(DBBasicTest, GetAllKeyVersions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ const size_t kNumInserts = 4;
+ const size_t kNumDeletes = 4;
+ const size_t kNumUpdates = 4;
+
+ // Check default column family
+ for (size_t i = 0; i != kNumInserts; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "value"));
+ }
+ for (size_t i = 0; i != kNumUpdates; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "value1"));
+ }
+ for (size_t i = 0; i != kNumDeletes; ++i) {
+ ASSERT_OK(Delete(std::to_string(i)));
+ }
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
+ db_, Slice(), Slice(), std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+ ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
+ db_, handles_[0], Slice(), Slice(), std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+
+ // Check non-default column family
+ for (size_t i = 0; i != kNumInserts - 1; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "value"));
+ }
+ for (size_t i = 0; i != kNumUpdates - 1; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "value1"));
+ }
+ for (size_t i = 0; i != kNumDeletes - 1; ++i) {
+ ASSERT_OK(Delete(1, std::to_string(i)));
+ }
+ ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
+ db_, handles_[1], Slice(), Slice(), std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
+ Options options = CurrentOptions();
+ Random rnd(301);
+ BlockBasedTableOptions table_options;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.block_size = 16 * 1024;
+ assert(table_options.block_size >
+ BlockBasedTable::kMultiGetReadStackBufSize);
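+ // A block size larger than the stack buffer used by MultiGet forces the
+ // read path to fall back to a heap-allocated buffer, which is the case
+ // being exercised here.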
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string zero_str(128, '\0');
+ for (int i = 0; i < 100; ++i) {
+ // Make the value compressible. A purely random string doesn't compress
+ // and the resultant data block will not be compressed
+ std::string value(RandomString(&rnd, 128) + zero_str);
+ assert(Put(Key(i), value) == Status::OK());
+ }
+ Flush();
+
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+}
+
+class DBBasicTestWithParallelIO
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool, bool, bool>> {
+ public:
+ DBBasicTestWithParallelIO() : DBTestBase("/db_basic_test_with_parallel_io") {
+ bool compressed_cache = std::get<0>(GetParam());
+ bool uncompressed_cache = std::get<1>(GetParam());
+ compression_enabled_ = std::get<2>(GetParam());
+ fill_cache_ = std::get<3>(GetParam());
+
+ if (compressed_cache) {
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ compressed_cache_ = std::make_shared<MyBlockCache>(cache);
+ }
+ if (uncompressed_cache) {
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ uncompressed_cache_ = std::make_shared<MyBlockCache>(cache);
+ }
+
+ env_->count_random_reads_ = true;
+
+ Options options = CurrentOptions();
+ Random rnd(301);
+ BlockBasedTableOptions table_options;
+
+#ifndef ROCKSDB_LITE
+ if (compression_enabled_) {
+ std::vector<CompressionType> compression_types;
+ compression_types = GetSupportedCompressions();
+ // Not every platform may have compression libraries available, so
+ // dynamically pick based on what's available
+ if (compression_types.size() == 0) {
+ compression_enabled_ = false;
+ } else {
+ options.compression = compression_types[0];
+ }
+ }
+#else
+ // GetSupportedCompressions() is not available in LITE build
+ if (!Snappy_Supported()) {
+ compression_enabled_ = false;
+ }
+#endif  // ROCKSDB_LITE
+
+ table_options.block_cache = uncompressed_cache_;
+ if (table_options.block_cache == nullptr) {
+ table_options.no_block_cache = true;
+ } else {
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ }
+ table_options.block_cache_compressed = compressed_cache_;
+ table_options.flush_block_policy_factory.reset(
+ new MyFlushBlockPolicyFactory());
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ if (!compression_enabled_) {
+ options.compression = kNoCompression;
+ }
+ Reopen(options);
+
+ std::string zero_str(128, '\0');
+ for (int i = 0; i < 100; ++i) {
+ // Make the value compressible. A purely random string doesn't compress
+ // and the resultant data block will not be compressed
+ values_.emplace_back(RandomString(&rnd, 128) + zero_str);
+ assert(Put(Key(i), values_[i]) == Status::OK());
+ }
+ Flush();
+
+ for (int i = 0; i < 100; ++i) {
+ // block cannot gain space by compression
+ uncompressable_values_.emplace_back(RandomString(&rnd, 256) + '\0');
+ std::string tmp_key = "a" + Key(i);
+ assert(Put(tmp_key, uncompressable_values_[i]) == Status::OK());
+ }
+ Flush();
+ }
+
+ bool CheckValue(int i, const std::string& value) {
+ if (values_[i].compare(value) == 0) {
+ return true;
+ }
+ return false;
+ }
+
+ bool CheckUncompressableValue(int i, const std::string& value) {
+ if (uncompressable_values_[i].compare(value) == 0) {
+ return true;
+ }
+ return false;
+ }
+
+ int num_lookups() { return uncompressed_cache_->num_lookups(); }
+ int num_found() { return uncompressed_cache_->num_found(); }
+ int num_inserts() { return uncompressed_cache_->num_inserts(); }
+
+ int num_lookups_compressed() { return compressed_cache_->num_lookups(); }
+ int num_found_compressed() { return compressed_cache_->num_found(); }
+ int num_inserts_compressed() { return compressed_cache_->num_inserts(); }
+
+ bool fill_cache() { return fill_cache_; }
+ bool compression_enabled() { return compression_enabled_; }
+ bool has_compressed_cache() { return compressed_cache_ != nullptr; }
+ bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; }
+
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ private:
+ class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ MyFlushBlockPolicyFactory() {}
+
+ virtual const char* Name() const override {
+ return "MyFlushBlockPolicyFactory";
+ }
+
+ virtual FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& data_block_builder) const override {
+ return new MyFlushBlockPolicy(data_block_builder);
+ }
+ };
+
+ class MyFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+ explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder)
+ : num_keys_(0), data_block_builder_(data_block_builder) {}
+
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ if (data_block_builder_.empty()) {
+ // First key in this block
+ num_keys_ = 1;
+ return false;
+ }
+ // Flush every 10 keys
+ if (num_keys_ == 10) {
+ num_keys_ = 1;
+ return true;
+ }
+ num_keys_++;
+ return false;
+ }
+
+ private:
+ int num_keys_;
+ const BlockBuilder& data_block_builder_;
+ };
+
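+ // Cache wrapper that forwards every operation to the wrapped cache while
+ // counting lookups, hits and inserts, so the tests can tell how many blocks
+ // were served from cache versus read from the table files.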
+ class MyBlockCache : public Cache {
+ public:
+ explicit MyBlockCache(std::shared_ptr<Cache>& target)
+ : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {}
+
+ virtual const char* Name() const override { return "MyBlockCache"; }
+
+ virtual Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override {
+ num_inserts_++;
+ return target_->Insert(key, value, charge, deleter, handle, priority);
+ }
+
+ virtual Handle* Lookup(const Slice& key,
+ Statistics* stats = nullptr) override {
+ num_lookups_++;
+ Handle* handle = target_->Lookup(key, stats);
+ if (handle != nullptr) {
+ num_found_++;
+ }
+ return handle;
+ }
+
+ virtual bool Ref(Handle* handle) override { return target_->Ref(handle); }
+
+ virtual bool Release(Handle* handle, bool force_erase = false) override {
+ return target_->Release(handle, force_erase);
+ }
+
+ virtual void* Value(Handle* handle) override {
+ return target_->Value(handle);
+ }
+
+ virtual void Erase(const Slice& key) override { target_->Erase(key); }
+ virtual uint64_t NewId() override { return target_->NewId(); }
+
+ virtual void SetCapacity(size_t capacity) override {
+ target_->SetCapacity(capacity);
+ }
+
+ virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+ target_->SetStrictCapacityLimit(strict_capacity_limit);
+ }
+
+ virtual bool HasStrictCapacityLimit() const override {
+ return target_->HasStrictCapacityLimit();
+ }
+
+ virtual size_t GetCapacity() const override {
+ return target_->GetCapacity();
+ }
+
+ virtual size_t GetUsage() const override { return target_->GetUsage(); }
+
+ virtual size_t GetUsage(Handle* handle) const override {
+ return target_->GetUsage(handle);
+ }
+
+ virtual size_t GetPinnedUsage() const override {
+ return target_->GetPinnedUsage();
+ }
+
+ virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; }
+
+ virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+ bool thread_safe) override {
+ return target_->ApplyToAllCacheEntries(callback, thread_safe);
+ }
+
+ virtual void EraseUnRefEntries() override {
+ return target_->EraseUnRefEntries();
+ }
+
+ int num_lookups() { return num_lookups_; }
+
+ int num_found() { return num_found_; }
+
+ int num_inserts() { return num_inserts_; }
+
+ private:
+ std::shared_ptr<Cache> target_;
+ int num_lookups_;
+ int num_found_;
+ int num_inserts_;
+ };
+
+ std::shared_ptr<MyBlockCache> compressed_cache_;
+ std::shared_ptr<MyBlockCache> uncompressed_cache_;
+ bool compression_enabled_;
+ std::vector<std::string> values_;
+ std::vector<std::string> uncompressable_values_;
+ bool fill_cache_;
+};
+
+TEST_P(DBBasicTestWithParallelIO, MultiGet) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+ int random_reads = env_->random_read_counter_.Read();
+ key_data[0] = Key(1);
+ key_data[1] = Key(51);
+ keys[0] = Slice(key_data[0]);
+ keys[1] = Slice(key_data[1]);
+ values[0].Reset();
+ values[1].Reset();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+ bool read_from_cache = false;
+ if (fill_cache()) {
+ if (has_uncompressed_cache()) {
+ read_from_cache = true;
+ } else if (has_compressed_cache() && compression_enabled()) {
+ read_from_cache = true;
+ }
+ }
+
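+ // Keys 1 and 51 live in the same data blocks as the warm-up keys 0 and 50
+ // (the flush block policy packs 10 keys per block), so no extra file reads
+ // are expected if the warm-up populated a usable cache; otherwise two more
+ // block reads are needed.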
+ int expected_reads = random_reads + (read_from_cache ? 0 : 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(10);
+ statuses.resize(10);
+ std::vector<int> key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+ for (size_t i = 0; i < key_ints.size(); ++i) {
+ key_data[i] = Key(key_ints[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_ints.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 2 : 3);
+ } else {
+ expected_reads += (read_from_cache ? 2 : 4);
+ }
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(10);
+ statuses.resize(10);
+ std::vector<int> key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+ for (size_t i = 0; i < key_uncmp.size(); ++i) {
+ key_data[i] = "a" + Key(key_uncmp[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_uncmp.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 3 : 3);
+ } else {
+ expected_reads += (read_from_cache ? 4 : 4);
+ }
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(5);
+ statuses.resize(5);
+ std::vector<int> key_tr{1, 2, 15, 16, 55};
+ for (size_t i = 0; i < key_tr.size(); ++i) {
+ key_data[i] = "a" + Key(key_tr[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_tr.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 0 : 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ } else {
+ if (has_uncompressed_cache()) {
+ expected_reads += (read_from_cache ? 0 : 3);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ } else {
+ // A rare case: even with block compression enabled, some data blocks
+ // are not compressed because of their content. If only the compressed
+ // cache is enabled, those uncompressed blocks are not cached, so block
+ // reads are triggered. The number of reads depends on the compression
+ // algorithm.
+ ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads);
+ }
+ }
+}
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ int read_count = 0;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "RetrieveMultipleBlocks:VerifyChecksum", [&](void *status) {
+ Status* s = static_cast<Status*>(status);
+ read_count++;
+ if (read_count == 2) {
+ *s = Status::Corruption();
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ // values[1] is not checked because its block read had Status::Corruption
+ // injected by the sync point callback above.
+ // ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::Corruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::MultiGet:FindTable", [&](void *status) {
+ Status* s = static_cast<Status*>(status);
+ *s = Status::IOError();
+ });
+ // DB open will create table readers unless we reduce the table cache
+ // capacity.
+ // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+ // is allocated with max_open_files - 10 as capacity. So override
+ // max_open_files to 11 so table cache capacity will become 1. This will
+ // prevent file open during DB open and force the file to be opened
+ // during MultiGet
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void *arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(CurrentOptions());
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_EQ(statuses[0], Status::IOError());
+ ASSERT_EQ(statuses[1], Status::IOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ ParallelIO, DBBasicTestWithParallelIO,
+ // Params are as follows -
+ // Param 0 - Compressed cache enabled
+ // Param 1 - Uncompressed cache enabled
+ // Param 2 - Data compression enabled
+ // Param 3 - ReadOptions::fill_cache
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool(), ::testing::Bool()));
+
+class DBBasicTestWithTimestampBase : public DBTestBase {
+ public:
+ explicit DBBasicTestWithTimestampBase(const std::string& dbname)
+ : DBTestBase(dbname) {}
+
+ protected:
+ class TestComparatorBase : public Comparator {
+ public:
+ explicit TestComparatorBase(size_t ts_sz) : Comparator(ts_sz) {}
+
+ const char* Name() const override { return "TestComparator"; }
+
+ void FindShortSuccessor(std::string*) const override {}
+
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ int r = CompareWithoutTimestamp(a, b);
+ if (r != 0 || 0 == timestamp_size()) {
+ return r;
+ }
+ return CompareTimestamp(
+ Slice(a.data() + a.size() - timestamp_size(), timestamp_size()),
+ Slice(b.data() + b.size() - timestamp_size(), timestamp_size()));
+ }
+
+ virtual int CompareImpl(const Slice& a, const Slice& b) const = 0;
+
+ int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+ assert(a.size() >= timestamp_size());
+ assert(b.size() >= timestamp_size());
+ Slice k1 = StripTimestampFromUserKey(a, timestamp_size());
+ Slice k2 = StripTimestampFromUserKey(b, timestamp_size());
+
+ return CompareImpl(k1, k2);
+ }
+
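+    // Timestamps are two fixed64 values (low, high), compared in descending
+    // order: a larger timestamp sorts first, so newer versions come before
+    // older ones.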
+ int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
+ if (!ts1.data() && !ts2.data()) {
+ return 0;
+ } else if (ts1.data() && !ts2.data()) {
+ return 1;
+ } else if (!ts1.data() && ts2.data()) {
+ return -1;
+ }
+ assert(ts1.size() == ts2.size());
+ uint64_t low1 = 0;
+ uint64_t low2 = 0;
+ uint64_t high1 = 0;
+ uint64_t high2 = 0;
+ auto* ptr1 = const_cast<Slice*>(&ts1);
+ auto* ptr2 = const_cast<Slice*>(&ts2);
+ if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) ||
+ !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) {
+ assert(false);
+ }
+ if (high1 < high2) {
+ return 1;
+ } else if (high1 > high2) {
+ return -1;
+ }
+ if (low1 < low2) {
+ return 1;
+ } else if (low1 > low2) {
+ return -1;
+ }
+ return 0;
+ }
+ };
+
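+  // Encode (low, high) as two fixed64 values into *ts and return a Slice
+  // over the resulting 16-byte timestamp.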
+ Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) {
+ assert(nullptr != ts);
+ ts->clear();
+ PutFixed64(ts, low);
+ PutFixed64(ts, high);
+ assert(ts->size() == sizeof(low) + sizeof(high));
+ return Slice(*ts);
+ }
+};
+
+class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
+ public:
+ DBBasicTestWithTimestamp()
+ : DBBasicTestWithTimestampBase("/db_basic_test_with_timestamp") {}
+
+ protected:
+ class TestComparator : public TestComparatorBase {
+ public:
+ const int kKeyPrefixLength =
+ 3; // 3: length of "key" in generated keys ("key" + std::to_string(j))
+ explicit TestComparator(size_t ts_sz) : TestComparatorBase(ts_sz) {}
+
+ int CompareImpl(const Slice& a, const Slice& b) const override {
+ int n1 = atoi(
+ std::string(a.data() + kKeyPrefixLength, a.size() - kKeyPrefixLength)
+ .c_str());
+ int n2 = atoi(
+ std::string(b.data() + kKeyPrefixLength, b.size() - kKeyPrefixLength)
+ .c_str());
+ return (n1 < n2) ? -1 : (n1 > n2) ? 1 : 0;
+ }
+ };
+};
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ InstrumentedMutexLock lock(&mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::vector<std::string> result;
+ {
+ InstrumentedMutexLock lock(&mutex_);
+ result = flushed_files_;
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() {
+ InstrumentedMutexLock lock(&mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ InstrumentedMutex mutex_;
+};
+
+TEST_F(DBBasicTestWithTimestamp, PutAndGetWithCompaction) {
+ const int kNumKeysPerFile = 8192;
+ const size_t kNumTimestamps = 2;
+ const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+ const size_t kSplitPosBase = kNumKeysPerTimestamp / 2;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ std::string tmp;
+ size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(
+ 10 /*bits_per_key*/, false /*use_block_based_builder*/));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_strs(kNumTimestamps);
+ std::vector<std::string> read_ts_strs(kNumTimestamps);
+ std::vector<Slice> write_ts_list;
+ std::vector<Slice> read_ts_list;
+
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i]));
+ read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i]));
+ const Slice& write_ts = write_ts_list.back();
+ WriteOptions wopts;
+ wopts.timestamp = &write_ts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ ASSERT_OK(Put(cf, "key" + std::to_string(j),
+ "value_" + std::to_string(j) + "_" + std::to_string(i),
+ wopts));
+ if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) {
+ // flush all keys with the same timestamp to two sst files, split at
+ // incremental positions such that lowerlevel[1].smallest.userkey ==
+ // higherlevel[0].largest.userkey
+ ASSERT_OK(Flush(cf));
+
+          // Compact files (2 at each level) to a lower level such that all
+          // keys with the same timestamp are at one level, with newer
+          // versions at higher levels.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ db_->CompactFiles(compact_opt, handles_[cf],
+ collector->GetFlushedFiles(),
+ static_cast<int>(kNumTimestamps - i));
+ collector->ClearFlushedFiles();
+ }
+ }
+ }
+ }
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ ropts.timestamp = &read_ts_list[i];
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ std::string value;
+ ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value));
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+ value);
+ }
+ }
+ }
+ };
+ verify_db_func();
+}
+#endif // !ROCKSDB_LITE
+
+class DBBasicTestWithTimestampWithParam
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBBasicTestWithTimestampWithParam()
+ : DBBasicTestWithTimestampBase(
+ "/db_basic_test_with_timestamp_with_param") {}
+
+ protected:
+ class TestComparator : public TestComparatorBase {
+ private:
+ const Comparator* cmp_without_ts_;
+
+ public:
+ explicit TestComparator(size_t ts_sz)
+ : TestComparatorBase(ts_sz), cmp_without_ts_(nullptr) {
+ cmp_without_ts_ = BytewiseComparator();
+ }
+
+ int CompareImpl(const Slice& a, const Slice& b) const override {
+ return cmp_without_ts_->Compare(a, b);
+ }
+ };
+};
+
+TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) {
+ const int kNumKeysPerFile = 8192;
+ const size_t kNumTimestamps = 6;
+ bool memtable_only = GetParam();
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ std::string tmp;
+ size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(
+ 10 /*bits_per_key*/, false /*use_block_based_builder*/));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ std::vector<CompressionType> compression_types;
+ compression_types.push_back(kNoCompression);
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+#if LZ4_VERSION_NUMBER >= 10400 // r124+
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+#endif // LZ4_VERSION_NUMBER >= 10400
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ }
+
+ // Switch compression dictionary on/off to check key extraction
+ // correctness in kBuffered state
+ std::vector<uint32_t> max_dict_bytes_list = {0, 1 << 14}; // 0 or 16KB
+
+ for (auto compression_type : compression_types) {
+ for (uint32_t max_dict_bytes : max_dict_bytes_list) {
+ options.compression = compression_type;
+ options.compression_opts.max_dict_bytes = max_dict_bytes;
+ if (compression_type == kZSTD) {
+ options.compression_opts.zstd_max_train_bytes = max_dict_bytes;
+ }
+ options.target_file_size_base = 1 << 26; // 64MB
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_strs(kNumTimestamps);
+ std::vector<std::string> read_ts_strs(kNumTimestamps);
+ std::vector<Slice> write_ts_list;
+ std::vector<Slice> read_ts_list;
+
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.emplace_back(
+ EncodeTimestamp(i * 2, 0, &write_ts_strs[i]));
+ read_ts_list.emplace_back(
+ EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i]));
+ const Slice& write_ts = write_ts_list.back();
+ WriteOptions wopts;
+ wopts.timestamp = &write_ts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+ ASSERT_OK(Put(
+ cf, "key" + std::to_string(j),
+ "value_" + std::to_string(j) + "_" + std::to_string(i), wopts));
+ }
+ if (!memtable_only) {
+ ASSERT_OK(Flush(cf));
+ }
+ }
+ }
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ ropts.timestamp = &read_ts_list[i];
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps;
+ ++j) {
+ std::string value;
+ ASSERT_OK(
+ db_->Get(ropts, cfh, "key" + std::to_string(j), &value));
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+ value);
+ }
+ }
+ }
+ };
+ verify_db_func();
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam,
+ ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_blob_index_test.cc b/src/rocksdb/db/db_blob_index_test.cc
new file mode 100644
index 000000000..24862f771
--- /dev/null
+++ b/src/rocksdb/db/db_blob_index_test.cc
@@ -0,0 +1,436 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/column_family.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kTypeBlobIndex is a value type used only by BlobDB. The base rocksdb
+// should accept the value type on write, and report NotSupported on reads
+// unless the caller requests the blob index explicitly. The base rocksdb
+// doesn't understand the format of the actual blob index (the value).
+class DBBlobIndexTest : public DBTestBase {
+ public:
+ enum Tier {
+ kMemtable = 0,
+ kImmutableMemtables = 1,
+ kL0SstFile = 2,
+ kLnSstFile = 3,
+ };
+ const std::vector<Tier> kAllTiers = {Tier::kMemtable,
+ Tier::kImmutableMemtables,
+ Tier::kL0SstFile, Tier::kLnSstFile};
+
+ DBBlobIndexTest() : DBTestBase("/db_blob_index_test") {}
+
+ ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
+
+ ColumnFamilyData* cfd() {
+ return reinterpret_cast<ColumnFamilyHandleImpl*>(cfh())->cfd();
+ }
+
+ Status PutBlobIndex(WriteBatch* batch, const Slice& key,
+ const Slice& blob_index) {
+ return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
+ blob_index);
+ }
+
+ Status Write(WriteBatch* batch) {
+ return dbfull()->Write(WriteOptions(), batch);
+ }
+
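+  // Read via DBImpl::GetImpl and map the result to a string: the value on
+  // success, or the "NOT_FOUND" / "NOT_SUPPORTED" sentinels checked by the
+  // tests below.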
+ std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ PinnableSlice value;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = cfh();
+ get_impl_options.value = &value;
+ get_impl_options.is_blob_index = is_blob_index;
+ auto s = dbfull()->GetImpl(read_options, key, get_impl_options);
+ if (s.IsNotFound()) {
+ return "NOT_FOUND";
+ }
+ if (s.IsNotSupported()) {
+ return "NOT_SUPPORTED";
+ }
+ if (!s.ok()) {
+ return s.ToString();
+ }
+ return value.ToString();
+ }
+
+ std::string GetBlobIndex(const Slice& key,
+ const Snapshot* snapshot = nullptr) {
+ bool is_blob_index = false;
+ std::string value = GetImpl(key, &is_blob_index, snapshot);
+ if (!is_blob_index) {
+ return "NOT_BLOB";
+ }
+ return value;
+ }
+
+ ArenaWrappedDBIter* GetBlobIterator() {
+ return dbfull()->NewIteratorImpl(
+ ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(),
+ nullptr /*read_callback*/, true /*allow_blob*/);
+ }
+
+ Options GetTestOptions() {
+ Options options;
+ options.create_if_missing = true;
+ options.num_levels = 2;
+ options.disable_auto_compactions = true;
+ // Disable auto flushes.
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 10;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ return options;
+ }
+
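+  // Move the data written so far into the requested tier: keep it in the
+  // active memtable, switch to an immutable memtable, flush to L0, or flush
+  // and compact into the last level (L1 here, since num_levels == 2).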
+ void MoveDataTo(Tier tier) {
+ switch (tier) {
+ case Tier::kMemtable:
+ break;
+ case Tier::kImmutableMemtables:
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ break;
+ case Tier::kL0SstFile:
+ ASSERT_OK(Flush());
+ break;
+ case Tier::kLnSstFile:
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "dummy"));
+ ASSERT_OK(Put("z", "dummy"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ break;
+ }
+ }
+};
+
+// Should be able to write kTypeBlobIndex to memtables and SST files.
+TEST_F(DBBlobIndexTest, Write) {
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+ for (int i = 1; i <= 5; i++) {
+ std::string index = ToString(i);
+ WriteBatch batch;
+ ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index));
+ ASSERT_OK(Write(&batch));
+ }
+ MoveDataTo(tier);
+ for (int i = 1; i <= 5; i++) {
+ std::string index = ToString(i);
+ ASSERT_EQ("blob" + index, GetBlobIndex("key" + index));
+ }
+ }
+}
+
+// Get should be able to return the blob index if is_blob_index is provided;
+// otherwise it should return a Status::NotSupported status.
+TEST_F(DBBlobIndexTest, Get) {
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("key", "value"));
+ ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index"));
+ ASSERT_OK(Write(&batch));
+ MoveDataTo(tier);
+ // Verify normal value
+ bool is_blob_index = false;
+ PinnableSlice value;
+ ASSERT_EQ("value", Get("key"));
+ ASSERT_EQ("value", GetImpl("key"));
+ ASSERT_EQ("value", GetImpl("key", &is_blob_index));
+ ASSERT_FALSE(is_blob_index);
+ // Verify blob index
+ ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
+ ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
+ ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index));
+ ASSERT_TRUE(is_blob_index);
+ }
+}
+
+// Get should NOT return Status::NotSupported if the blob index has been
+// overwritten with a normal value.
+TEST_F(DBBlobIndexTest, Updated) {
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+ WriteBatch batch;
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index"));
+ }
+ ASSERT_OK(Write(&batch));
+    // Take a snapshot to prevent the blob values from being purged.
+ const Snapshot* snapshot = dbfull()->GetSnapshot();
+ ASSERT_OK(Put("key1", "new_value"));
+ ASSERT_OK(Merge("key2", "a"));
+ ASSERT_OK(Merge("key2", "b"));
+ ASSERT_OK(Merge("key2", "c"));
+ ASSERT_OK(Delete("key3"));
+ ASSERT_OK(SingleDelete("key4"));
+ ASSERT_OK(Delete("key5"));
+ ASSERT_OK(Merge("key5", "a"));
+ ASSERT_OK(Merge("key5", "b"));
+ ASSERT_OK(Merge("key5", "c"));
+ ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
+ MoveDataTo(tier);
+ for (int i = 0; i < 10; i++) {
+ ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot));
+ }
+ ASSERT_EQ("new_value", Get("key1"));
+ ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
+ ASSERT_EQ("NOT_FOUND", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+ ASSERT_EQ("a,b,c", GetImpl("key5"));
+ for (int i = 6; i < 9; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i)));
+ }
+ ASSERT_EQ("blob_index", GetBlobIndex("key9"));
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+}
+
+// The iterator should return the blob value if the allow_blob flag is set;
+// otherwise it should return a Status::NotSupported status.
+TEST_F(DBBlobIndexTest, Iterate) {
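+  // Per-key sequences of writes, listed newest first; the fill loop below
+  // applies them in reverse so that data[i][0] ends up as the newest version
+  // of key i.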
+ const std::vector<std::vector<ValueType>> data = {
+ /*00*/ {kTypeValue},
+ /*01*/ {kTypeBlobIndex},
+ /*02*/ {kTypeValue},
+ /*03*/ {kTypeBlobIndex, kTypeValue},
+ /*04*/ {kTypeValue},
+ /*05*/ {kTypeValue, kTypeBlobIndex},
+ /*06*/ {kTypeValue},
+ /*07*/ {kTypeDeletion, kTypeBlobIndex},
+ /*08*/ {kTypeValue},
+ /*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
+ /*10*/ {kTypeValue},
+ /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
+ /*12*/ {kTypeValue},
+ /*13*/
+ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
+ /*14*/ {kTypeValue},
+ /*15*/ {kTypeBlobIndex},
+ /*16*/ {kTypeValue},
+ };
+
+ auto get_key = [](int index) {
+ char buf[20];
+ snprintf(buf, sizeof(buf), "%02d", index);
+ return "key" + std::string(buf);
+ };
+
+ auto get_value = [&](int index, int version) {
+ return get_key(index) + "_value" + ToString(version);
+ };
+
+ auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
+ const Slice& expected_value) {
+ ASSERT_EQ(expected_status, iterator->status().code());
+ if (expected_status == Status::kOk) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ(expected_value, iterator->value());
+ } else {
+ ASSERT_FALSE(iterator->Valid());
+ }
+ };
+
+ auto create_normal_iterator = [&]() -> Iterator* {
+ return dbfull()->NewIterator(ReadOptions());
+ };
+
+ auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
+
+ auto check_is_blob = [&](bool is_blob) {
+ return [is_blob](Iterator* iterator) {
+ ASSERT_EQ(is_blob,
+ reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
+ };
+ };
+
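+  // Exercise Seek, Next, SeekForPrev and Prev around the key at `index` and
+  // check the resulting iterator status and value in each direction.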
+ auto verify = [&](int index, Status::Code expected_status,
+ const Slice& forward_value, const Slice& backward_value,
+ std::function<Iterator*()> create_iterator,
+ std::function<void(Iterator*)> extra_check = nullptr) {
+ // Seek
+ auto* iterator = create_iterator();
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index));
+ check_iterator(iterator, expected_status, forward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // Next
+ iterator = create_iterator();
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index - 1));
+ ASSERT_TRUE(iterator->Valid());
+ iterator->Next();
+ check_iterator(iterator, expected_status, forward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // SeekForPrev
+ iterator = create_iterator();
+ ASSERT_OK(iterator->Refresh());
+ iterator->SeekForPrev(get_key(index));
+ check_iterator(iterator, expected_status, backward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // Prev
+ iterator = create_iterator();
+ iterator->Seek(get_key(index + 1));
+ ASSERT_TRUE(iterator->Valid());
+ iterator->Prev();
+ check_iterator(iterator, expected_status, backward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+ };
+
+ for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
+    // Take snapshots to prevent the values from being purged.
+ std::vector<const Snapshot*> snapshots;
+ DestroyAndReopen(GetTestOptions());
+
+ // fill data
+ for (int i = 0; i < static_cast<int>(data.size()); i++) {
+ for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
+ std::string key = get_key(i);
+ std::string value = get_value(i, j);
+ WriteBatch batch;
+ switch (data[i][j]) {
+ case kTypeValue:
+ ASSERT_OK(Put(key, value));
+ break;
+ case kTypeDeletion:
+ ASSERT_OK(Delete(key));
+ break;
+ case kTypeSingleDeletion:
+ ASSERT_OK(SingleDelete(key));
+ break;
+ case kTypeMerge:
+ ASSERT_OK(Merge(key, value));
+ break;
+ case kTypeBlobIndex:
+ ASSERT_OK(PutBlobIndex(&batch, key, value));
+ ASSERT_OK(Write(&batch));
+ break;
+ default:
+ assert(false);
+ };
+ }
+ snapshots.push_back(dbfull()->GetSnapshot());
+ }
+ ASSERT_OK(
+ dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
+ snapshots.push_back(dbfull()->GetSnapshot());
+ MoveDataTo(tier);
+
+ // Normal iterator
+ verify(1, Status::kNotSupported, "", "", create_normal_iterator);
+ verify(3, Status::kNotSupported, "", "", create_normal_iterator);
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_normal_iterator);
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_normal_iterator);
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_normal_iterator);
+ verify(11, Status::kNotSupported, "", "", create_normal_iterator);
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_normal_iterator);
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_normal_iterator);
+
+ // Iterator with blob support
+ verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_blob_iterator, check_is_blob(false));
+
+#ifndef ROCKSDB_LITE
+ // Iterator with blob support and using seek.
+ ASSERT_OK(dbfull()->SetOptions(
+ cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
+ verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_blob_iterator, check_is_blob(false));
+#endif // !ROCKSDB_LITE
+
+ for (auto* snapshot : snapshots) {
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_block_cache_test.cc b/src/rocksdb/db/db_block_cache_test.cc
new file mode 100644
index 000000000..3031e56bb
--- /dev/null
+++ b/src/rocksdb/db/db_block_cache_test.cc
@@ -0,0 +1,761 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cstdlib>
+#include "cache/lru_cache.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlockCacheTest : public DBTestBase {
+ private:
+ size_t miss_count_ = 0;
+ size_t hit_count_ = 0;
+ size_t insert_count_ = 0;
+ size_t failure_count_ = 0;
+ size_t compression_dict_miss_count_ = 0;
+ size_t compression_dict_hit_count_ = 0;
+ size_t compression_dict_insert_count_ = 0;
+ size_t compressed_miss_count_ = 0;
+ size_t compressed_hit_count_ = 0;
+ size_t compressed_insert_count_ = 0;
+ size_t compressed_failure_count_ = 0;
+
+ public:
+ const size_t kNumBlocks = 10;
+ const size_t kValueSize = 100;
+
+ DBBlockCacheTest() : DBTestBase("/db_block_cache_test") {}
+
+ BlockBasedTableOptions GetTableOptions() {
+ BlockBasedTableOptions table_options;
+ // Set a small enough block size so that each key-value get its own block.
+ table_options.block_size = 1;
+ return table_options;
+ }
+
+ Options GetOptions(const BlockBasedTableOptions& table_options) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.avoid_flush_during_recovery = false;
+ // options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ return options;
+ }
+
+ void InitTable(const Options& /*options*/) {
+ std::string value(kValueSize, 'a');
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ ASSERT_OK(Put(ToString(i), value.c_str()));
+ }
+ }
+
+ void RecordCacheCounters(const Options& options) {
+ miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ compressed_miss_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ compressed_hit_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ compressed_insert_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+ compressed_failure_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ }
+
+ void RecordCacheCountersForCompressionDict(const Options& options) {
+ compression_dict_miss_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ compression_dict_hit_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ compression_dict_insert_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ }
+
+ void CheckCacheCounters(const Options& options, size_t expected_misses,
+ size_t expected_hits, size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ ASSERT_EQ(miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(failure_count_ + expected_failures, new_failure_count);
+ miss_count_ = new_miss_count;
+ hit_count_ = new_hit_count;
+ insert_count_ = new_insert_count;
+ failure_count_ = new_failure_count;
+ }
+
+ void CheckCacheCountersForCompressionDict(
+ const Options& options, size_t expected_compression_dict_misses,
+ size_t expected_compression_dict_hits,
+ size_t expected_compression_dict_inserts) {
+ size_t new_compression_dict_miss_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ size_t new_compression_dict_hit_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ size_t new_compression_dict_insert_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses,
+ new_compression_dict_miss_count);
+ ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits,
+ new_compression_dict_hit_count);
+ ASSERT_EQ(
+ compression_dict_insert_count_ + expected_compression_dict_inserts,
+ new_compression_dict_insert_count);
+ compression_dict_miss_count_ = new_compression_dict_miss_count;
+ compression_dict_hit_count_ = new_compression_dict_hit_count;
+ compression_dict_insert_count_ = new_compression_dict_insert_count;
+ }
+
+ void CheckCompressedCacheCounters(const Options& options,
+ size_t expected_misses,
+ size_t expected_hits,
+ size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ size_t new_hit_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ size_t new_insert_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count);
+ compressed_miss_count_ = new_miss_count;
+ compressed_hit_count_ = new_hit_count;
+ compressed_insert_count_ = new_insert_count;
+ compressed_failure_count_ = new_failure_count;
+ }
+};
+
+TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
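+  // Use a zero-capacity LRU cache so that cache usage reflects only the
+  // blocks pinned by the live iterator.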
+ std::shared_ptr<Cache> cache = NewLRUCache(0, 0, false);
+ table_options.block_cache = cache;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ ASSERT_EQ(0, cache->GetUsage());
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(0));
+ ASSERT_LT(0, cache->GetUsage());
+ delete iter;
+ iter = nullptr;
+ ASSERT_EQ(0, cache->GetUsage());
+}
+
+TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) {
+ ReadOptions read_options;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ std::shared_ptr<Cache> cache = NewLRUCache(0, 0, false);
+ table_options.block_cache = cache;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ // Load blocks into cache.
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ iterators[i].reset(iter);
+ }
+ size_t usage = cache->GetUsage();
+ ASSERT_LT(0, usage);
+ cache->SetCapacity(usage);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+ // Test with strict capacity limit.
+ cache->SetStrictCapacityLimit(true);
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(kNumBlocks - 1));
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ delete iter;
+ iter = nullptr;
+
+ // Release iterators and access cache again.
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ iterators[i].reset();
+ CheckCacheCounters(options, 0, 0, 0, 0);
+ }
+ ASSERT_EQ(0, cache->GetPinnedUsage());
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 0, 1, 0, 0);
+ iterators[i].reset(iter);
+ }
+}
+
+#ifdef SNAPPY
+TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) {
+ ReadOptions read_options;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ options.compression = CompressionType::kSnappyCompression;
+ InitTable(options);
+
+ std::shared_ptr<Cache> cache = NewLRUCache(0, 0, false);
+ std::shared_ptr<Cache> compressed_cache = NewLRUCache(1 << 25, 0, false);
+ table_options.block_cache = cache;
+ table_options.block_cache_compressed = compressed_cache;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ // Load blocks into cache.
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+ iterators[i].reset(iter);
+ }
+ size_t usage = cache->GetUsage();
+ ASSERT_LT(0, usage);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+ size_t compressed_usage = compressed_cache->GetUsage();
+ ASSERT_LT(0, compressed_usage);
+ // Compressed block cache cannot be pinned.
+ ASSERT_EQ(0, compressed_cache->GetPinnedUsage());
+
+ // Set strict capacity limit flag. Now block will only load into compressed
+ // block cache.
+ cache->SetCapacity(usage);
+ cache->SetStrictCapacityLimit(true);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(kNumBlocks - 1));
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+ delete iter;
+ iter = nullptr;
+
+ // Clear strict capacity limit flag. This time we shall hit compressed block
+ // cache.
+ cache->SetStrictCapacityLimit(false);
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(kNumBlocks - 1));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ CheckCompressedCacheCounters(options, 0, 1, 0, 0);
+ delete iter;
+ iter = nullptr;
+}
+#endif // SNAPPY
+
+#ifndef ROCKSDB_LITE
+
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, /* only index/filter were added */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+ uint64_t int_num;
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ // Make sure filter block is in cache.
+ std::string value;
+ ReadOptions ropt;
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+
+ // Miss count should remain the same.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Make sure index block is in cache.
+ auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(index_block_hit + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(index_block_hit + 2,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+// With fill_cache = false, fill up the cache and then iterate over the entire
+// db, verifying that the dummy entries inserted in
+// `BlockBasedTable::NewDataBlockIterator` do not cause heap-use-after-free
+// errors in COMPILE_WITH_ASAN=1 runs.
+TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ std::shared_ptr<Cache> cache = NewLRUCache(10, 0, true);
+ table_options.block_cache = cache;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("key1", "val1"));
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key3", "val3"));
+ ASSERT_OK(Put("key4", "val4"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key5", "val5"));
+ ASSERT_OK(Put("key6", "val6"));
+ ASSERT_OK(Flush());
+
+ Iterator* iter = nullptr;
+
+ iter = db_->NewIterator(read_options);
+ iter->Seek(ToString(0));
+ while (iter->Valid()) {
+ iter->Next();
+ }
+ delete iter;
+ iter = nullptr;
+}
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ LRUCacheOptions co;
+ // 500 bytes are enough to hold the first two blocks
+ co.capacity = 500;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "longer_key", "val"));
+ // Create a new table
+ ASSERT_OK(Flush(1));
+ size_t index_bytes_insert =
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT);
+ size_t filter_bytes_insert =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT);
+ ASSERT_GT(index_bytes_insert, 0);
+ ASSERT_GT(filter_bytes_insert, 0);
+ ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
+ // set the cache capacity to the current usage
+ cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
+ // The index and filter eviction statistics were broken by the refactoring
+ // that moved the readers out of the block cache. Disabling these until we can
+ // bring the stats back.
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
+ // Note that the second key needs to be no longer than the first one.
+ // Otherwise the second index block may not fit in cache.
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table
+ ASSERT_OK(Flush(1));
+  // The cache should have evicted the old index and filter entries.
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT),
+ index_bytes_insert);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
+ filter_bytes_insert);
+ // The index and filter eviction statistics were broken by the refactoring
+ // that moved the readers out of the block cache. Disabling these until we can
+ // bring the stats back.
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+ // index_bytes_insert);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+ // filter_bytes_insert);
+}
+
+namespace {
+
+// A mock cache that wraps LRUCache and records how many entries have been
+// inserted for each priority.
+class MockCache : public LRUCache {
+ public:
+ static uint32_t high_pri_insert_count;
+ static uint32_t low_pri_insert_count;
+
+ MockCache()
+ : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/,
+ false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) {
+ }
+
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value), Handle** handle,
+ Priority priority) override {
+ if (priority == Priority::LOW) {
+ low_pri_insert_count++;
+ } else {
+ high_pri_insert_count++;
+ }
+ return LRUCache::Insert(key, value, charge, deleter, handle, priority);
+ }
+};
+
+uint32_t MockCache::high_pri_insert_count = 0;
+uint32_t MockCache::low_pri_insert_count = 0;
+
+} // anonymous namespace
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) {
+ for (auto priority : {Cache::Priority::LOW, Cache::Priority::HIGH}) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache.reset(new MockCache());
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ table_options.cache_index_and_filter_blocks_with_high_priority =
+ priority == Cache::Priority::HIGH ? true : false;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ MockCache::high_pri_insert_count = 0;
+ MockCache::low_pri_insert_count = 0;
+
+ // Create a new table.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, /* only index/filter were added */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+ if (priority == Cache::Priority::LOW) {
+ ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(2u, MockCache::low_pri_insert_count);
+ } else {
+ ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(0u, MockCache::low_pri_insert_count);
+ }
+
+ // Access data block.
+ ASSERT_EQ("value", Get("foo"));
+
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(3, /*adding data block*/
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+ // Data block should be inserted with low priority.
+ if (priority == Cache::Priority::LOW) {
+ ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(3u, MockCache::low_pri_insert_count);
+ } else {
+ ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(1u, MockCache::low_pri_insert_count);
+ }
+ }
+}
+
+TEST_F(DBBlockCacheTest, ParanoidFileChecks) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.level0_file_num_compaction_trigger = 2;
+ options.paranoid_file_checks = true;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "1_key", "val"));
+ ASSERT_OK(Put(1, "9_key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(1, /* read and cache data block */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Put(1, "1_key2", "val2"));
+ ASSERT_OK(Put(1, "9_key2", "val2"));
+ // Create a new SST file. This will further trigger a compaction
+ // and generate another file.
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(3, /* Totally 3 files created up to now */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  // After disabling options.paranoid_file_checks, NO further block is added
+  // when a new file is generated.
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+ ASSERT_OK(Put(1, "1_key3", "val3"));
+ ASSERT_OK(Put(1, "9_key3", "val3"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "1_key4", "val4"));
+ ASSERT_OK(Put(1, "9_key4", "val4"));
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(3, /* Totally 3 files created up to now */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+TEST_F(DBBlockCacheTest, CompressedCache) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ int num_iter = 80;
+
+  // Run this test for four iterations.
+  // Iteration 1: only an uncompressed block cache
+  // Iteration 2: only a compressed block cache
+  // Iteration 3: both block cache and compressed cache
+  // Iteration 4: both block cache and compressed cache, but DB is not
+  //              compressed
+ for (int iter = 0; iter < 4; iter++) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024; // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ switch (iter) {
+ case 0:
+ // only uncompressed block cache
+ table_options.block_cache = NewLRUCache(8 * 1024);
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 1:
+ // no block cache, only compressed cache
+ table_options.no_block_cache = true;
+ table_options.block_cache = nullptr;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 2:
+ // both compressed and uncompressed block cache
+ table_options.block_cache = NewLRUCache(1024);
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 3:
+ // both block cache and compressed cache, but DB is not compressed
+ // also, make block cache sizes bigger, to trigger block cache hits
+ table_options.block_cache = NewLRUCache(1024 * 1024);
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ break;
+ default:
+ FAIL();
+ }
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+    // Write 80 values of roughly 1KB each (a new random value every 4 keys)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = RandomString(&rnd, 1000);
+ }
+ values.push_back(str);
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // flush all data from memtable so that reads are from block cache
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+
+ // check that we triggered the appropriate code paths in the cache
+ switch (iter) {
+ case 0:
+ // only uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 1:
+ // no block cache, only compressed cache
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 2:
+ // both compressed and uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 3:
+ // both compressed and uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        // the compressed cache doesn't have any hits since blocks are not
+        // compressed on storage
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+ break;
+ default:
+ FAIL();
+ }
+
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ }
+}
+
+TEST_F(DBBlockCacheTest, CacheCompressionDict) {
+ const int kNumFiles = 4;
+ const int kNumEntriesPerFile = 128;
+ const int kNumBytesPerEntry = 1024;
+
+ // Try all the available libraries that support dictionary compression
+ std::vector<CompressionType> compression_types;
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+ if (LZ4_Supported()) {
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+ }
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ } else if (ZSTDNotFinal_Supported()) {
+ compression_types.push_back(kZSTDNotFinalCompression);
+ }
+ Random rnd(301);
+ for (auto compression_type : compression_types) {
+ Options options = CurrentOptions();
+ options.compression = compression_type;
+ options.compression_opts.max_dict_bytes = 4096;
+ options.create_if_missing = true;
+ options.num_levels = 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache.reset(new MockCache());
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ RecordCacheCountersForCompressionDict(options);
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ std::string value = RandomString(&rnd, kNumBytesPerEntry);
+ ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+ }
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+ // Compression dictionary blocks are preloaded.
+ CheckCacheCountersForCompressionDict(
+ options, kNumFiles /* expected_compression_dict_misses */,
+ 0 /* expected_compression_dict_hits */,
+ kNumFiles /* expected_compression_dict_inserts */);
+
+ // Seek to a key in a file. It should cause the SST's dictionary meta-block
+ // to be read.
+ RecordCacheCounters(options);
+ RecordCacheCountersForCompressionDict(options);
+ ReadOptions read_options;
+ ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+ // Two block hits: index and dictionary since they are prefetched
+ // One block missed/added: data block
+ CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
+ 1 /* expected_inserts */, 0 /* expected_failures */);
+ CheckCacheCountersForCompressionDict(
+ options, 0 /* expected_compression_dict_misses */,
+ 1 /* expected_compression_dict_hits */,
+ 0 /* expected_compression_dict_inserts */);
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_bloom_filter_test.cc b/src/rocksdb/db/db_bloom_filter_test.cc
new file mode 100644
index 000000000..dcad00327
--- /dev/null
+++ b/src/rocksdb/db/db_bloom_filter_test.cc
@@ -0,0 +1,1910 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/filter_policy_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+using BFP = BloomFilterPolicy;
+} // namespace
+
+// DB tests related to bloom filter.
+
+class DBBloomFilterTest : public DBTestBase {
+ public:
+ DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {}
+};
+
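+// Parameterized by the Bloom filter implementation (BFP::Mode), whether
+// filters are partitioned, and the table format_version.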
+class DBBloomFilterTestWithParam : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<BFP::Mode, bool, uint32_t>> {
+ // public testing::WithParamInterface<bool> {
+ protected:
+ BFP::Mode bfp_impl_;
+ bool partition_filters_;
+ uint32_t format_version_;
+
+ public:
+ DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {}
+
+ ~DBBloomFilterTestWithParam() override {}
+
+ void SetUp() override {
+ bfp_impl_ = std::get<0>(GetParam());
+ partition_filters_ = std::get<1>(GetParam());
+ format_version_ = std::get<2>(GetParam());
+ }
+};
+
+class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {};
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+ const char* Name() const override {
+ return "SliceTransformLimitedDomainGeneric";
+ }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 5);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // prefix will be x????
+ return src.size() >= 5;
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // prefix will be x????
+ return dst.size() == 5;
+ }
+};
+
+// KeyMayExist can lead to a few false positives, but not false negatives.
+// To make the test deterministic, use a much larger number of bits per key
+// (20) than bits in the key, so that false positives are eliminated.
+TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
+ do {
+ ReadOptions ropts;
+ std::string value;
+ anon::OptionsOverride options_override;
+ options_override.filter_policy.reset(new BFP(20, bfp_impl_));
+ options_override.partition_filters = partition_filters_;
+ options_override.metadata_block_size = 32;
+ Options options = CurrentOptions(options_override);
+ if (partition_filters_ &&
+ static_cast<BlockBasedTableOptions*>(
+ options.table_factory->GetOptions())
+ ->index_type != BlockBasedTableOptions::kTwoLevelIndexSearch) {
+ // In the current implementation partitioned filters depend on partitioned
+ // indexes
+ continue;
+ }
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+
+ ASSERT_OK(Put(1, "a", "b"));
+ bool value_found = false;
+ ASSERT_TRUE(
+ db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+ ASSERT_TRUE(value_found);
+ ASSERT_EQ("b", value);
+
+ ASSERT_OK(Flush(1));
+ value.clear();
+
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(
+ db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+ ASSERT_TRUE(!value_found);
+ // assert that no new files were opened and no new blocks were
+ // read into block cache.
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Delete(1, "a"));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */);
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Delete(1, "c"));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    // The KeyMayExist function only checks data in the block cache, which is
+    // not used by the plain table format.
+ } while (
+ ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
+}
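+
+// A minimal sketch of the KeyMayExist() pattern exercised above
+// (illustrative only; `db` and `cf` are assumed to be an open DB and a
+// column family handle):
+//
+//   std::string value;
+//   bool value_found = false;
+//   if (db->KeyMayExist(ReadOptions(), cf, "a", &value, &value_found)) {
+//     // The key may exist; `value` is trustworthy only if value_found is
+//     // true, otherwise a regular Get() is needed to confirm.
+//   } else {
+//     // Definitely absent: KeyMayExist() has no false negatives.
+//   }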
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor =
+ std::make_shared<SliceTransformLimitedDomainGeneric>();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+ dbfull()->Flush(fo);
+
+ ASSERT_EQ("foo", Get("barbarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ("foo2", Get("barbarbar2"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ro.total_order_seek = true;
+ ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound());
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+ dbfull()->Flush(fo);
+
+ ASSERT_EQ("foo", Get("barbarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("foo2", Get("barbarbar2"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+
+ ro.total_order_seek = true;
+ ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound());
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by key
+    // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ dbfull()->Flush(fo);
+
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    // Reopen with whole key filtering enabled and the prefix extractor
+    // NULL. Bloom filters should be off for both whole key and
+    // prefix bloom.
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.prefix_extractor.reset();
+ Reopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ // Write DB with only full key filtering.
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by key
+    // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+    // Reopen with whole key filtering off and the prefix extractor enabled.
+    // Still no bloom filter should be used.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ // Try to create a DB with mixed files:
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by key
+    // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ options.prefix_extractor.reset();
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ // Try to create a DB with mixed files.
+ ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+    // In this case we need to insert some keys to make sure files are not
+    // filtered out by key ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ Flush();
+
+ // Now we have two files:
+ // File 1: An older file with prefix bloom.
+ // File 2: A newer file with whole bloom filter.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+
+    // Reopen with the same settings: only whole key filtering is used
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+
+    // Restart with both filters allowed
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+    // File 1 will have it filtered out.
+    // File 2 will not, as the prefix `foo` exists in the file.
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+
+    // Restart with only the prefix bloom allowed.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ uint64_t bloom_filter_useful_all_levels = 0;
+ for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+ if (kv.second.bloom_filter_useful > 0) {
+ bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+ }
+ }
+ ASSERT_EQ(12, bloom_filter_useful_all_levels);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
+ do {
+ Options options = CurrentOptions();
+ env_->count_random_reads_ = true;
+ options.env = env_;
+ // ChangeCompactOptions() only changes compaction style, which does not
+ // trigger reset of table_factory
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl_));
+ table_options.partition_filters = partition_filters_;
+ if (partition_filters_) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.format_version = format_version_;
+ if (format_version_ >= 4) {
+      // Value delta encoding is challenged more with an index restart interval > 1
+ table_options.index_block_restart_interval = 8;
+ }
+ table_options.metadata_block_size = 32;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Populate multiple layers
+ const int N = 10000;
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Compact(1, "a", "z");
+ for (int i = 0; i < N; i += 100) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Flush(1);
+
+ // Prevent auto compactions triggered by seeks
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+
+ // Lookup present keys. Should rarely read from small sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ int reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d present => %d reads\n", N, reads);
+ ASSERT_GE(reads, N);
+ if (partition_filters_) {
+      // Without a block cache, we read an extra partition filter per level
+      // per read and a partition index per read
+ ASSERT_LE(reads, 4 * N + 2 * N / 100);
+ } else {
+ ASSERT_LE(reads, N + 2 * N / 100);
+ }
+
+    // Lookup missing keys. Should rarely read from either sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+ }
+ reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d missing => %d reads\n", N, reads);
+ if (partition_filters_) {
+      // With partitioned filters we read one extra filter per level for each
+      // missed read.
+ ASSERT_LE(reads, 2 * N + 3 * N / 100);
+ } else {
+ ASSERT_LE(reads, 3 * N / 100);
+ }
+
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+ Close();
+ } while (ChangeCompactOptions());
+}
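+
+// A sketch of the arithmetic behind the read bounds asserted above: with
+// N = 10000 present-key lookups and no partitioned filters, the cost is
+// roughly one data-block read per key plus an allowance for the ~N/100 keys
+// that also live in the small sstable, hence the bound
+// N + 2 * N / 100 = 10200. With partitioned filters and no block cache,
+// each read may add a partition filter per level and a partition index,
+// hence the looser bound 4 * N + 2 * N / 100 = 40200.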
+
+#ifndef ROCKSDB_VALGRIND_RUN
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, DBBloomFilterTestDefFormatVersion,
+ ::testing::Values(
+ std::make_tuple(BFP::kDeprecatedBlock, false,
+ test::kDefaultFormatVersion),
+ std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion),
+ std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, DBBloomFilterTestWithParam,
+ ::testing::Values(
+ std::make_tuple(BFP::kDeprecatedBlock, false,
+ test::kDefaultFormatVersion),
+ std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion),
+ std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+ FormatLatest, DBBloomFilterTestWithParam,
+ ::testing::Values(
+ std::make_tuple(BFP::kDeprecatedBlock, false,
+ test::kLatestFormatVersion),
+ std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion),
+ std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion)));
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_F(DBBloomFilterTest, BloomFilterRate) {
+ while (ChangeFilterOptions()) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+    // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ // Check if filter is useful
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+ ASSERT_GE(
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful,
+ maxKey * 0.98);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, BloomFilterCompatibility) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // Create with block based filter
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+
+ // Check db with full filter
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ // Check db with partitioned full filter
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+}
+
+TEST_F(DBBloomFilterTest, BloomFilterReverseCompatibility) {
+ for (bool partition_filters : {true, false}) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ if (partition_filters) {
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Create with full filter
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+
+ // Check db with block_based filter
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ }
+}
+
+namespace {
+// A wrapped bloom over block-based FilterPolicy
+class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy {
+ public:
+ explicit TestingWrappedBlockBasedFilterPolicy(int bits_per_key)
+ : filter_(NewBloomFilterPolicy(bits_per_key, true)), counter_(0) {}
+
+ ~TestingWrappedBlockBasedFilterPolicy() override { delete filter_; }
+
+ const char* Name() const override {
+ return "TestingWrappedBlockBasedFilterPolicy";
+ }
+
+ void CreateFilter(const ROCKSDB_NAMESPACE::Slice* keys, int n,
+ std::string* dst) const override {
+ std::unique_ptr<ROCKSDB_NAMESPACE::Slice[]> user_keys(
+ new ROCKSDB_NAMESPACE::Slice[n]);
+ for (int i = 0; i < n; ++i) {
+ user_keys[i] = convertKey(keys[i]);
+ }
+ return filter_->CreateFilter(user_keys.get(), n, dst);
+ }
+
+ bool KeyMayMatch(const ROCKSDB_NAMESPACE::Slice& key,
+ const ROCKSDB_NAMESPACE::Slice& filter) const override {
+ counter_++;
+ return filter_->KeyMayMatch(convertKey(key), filter);
+ }
+
+ uint32_t GetCounter() { return counter_; }
+
+ private:
+ const FilterPolicy* filter_;
+ mutable uint32_t counter_;
+
+ ROCKSDB_NAMESPACE::Slice convertKey(
+ const ROCKSDB_NAMESPACE::Slice& key) const {
+ return key;
+ }
+};
+} // namespace
+
+TEST_F(DBBloomFilterTest, WrappedBlockBasedFilterPolicy) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ TestingWrappedBlockBasedFilterPolicy* policy =
+ new TestingWrappedBlockBasedFilterPolicy(10);
+ table_options.filter_policy.reset(policy);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+  // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ ASSERT_EQ(0U, policy->GetCounter());
+ Flush(1);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(1U * maxKey, policy->GetCounter());
+
+ // Check if filter is useful
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+ ASSERT_EQ(2U * maxKey, policy->GetCounter());
+}
+
+namespace {
+// NOTE: This class is referenced by HISTORY.md as a model for a wrapper
+// FilterPolicy selecting among configurations based on context.
+class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
+ public:
+ explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+ int bpk_otherwise)
+ : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)),
+ policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
+ policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}
+
+ // OK to use built-in policy name because we are deferring to a
+ // built-in builder. We aren't changing the serialized format.
+ const char* Name() const override { return policy_fifo_->Name(); }
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext& context) const override {
+ if (context.compaction_style == kCompactionStyleFIFO) {
+ return policy_fifo_->GetBuilderWithContext(context);
+ } else if (context.level_at_creation == 0) {
+ return policy_l0_other_->GetBuilderWithContext(context);
+ } else {
+ return policy_otherwise_->GetBuilderWithContext(context);
+ }
+ }
+
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+ // OK to defer to any of them; they all can parse built-in filters
+ // from any settings.
+ return policy_fifo_->GetFilterBitsReader(contents);
+ }
+
+ // Defer just in case configuration uses block-based filter
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ policy_otherwise_->CreateFilter(keys, n, dst);
+ }
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ return policy_otherwise_->KeyMayMatch(key, filter);
+ }
+
+ private:
+ const std::unique_ptr<const FilterPolicy> policy_fifo_;
+ const std::unique_ptr<const FilterPolicy> policy_l0_other_;
+ const std::unique_ptr<const FilterPolicy> policy_otherwise_;
+};
+
+class TestingContextCustomFilterPolicy
+ : public LevelAndStyleCustomFilterPolicy {
+ public:
+ explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+ int bpk_otherwise)
+ : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) {
+ }
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext& context) const override {
+ test_report_ += "cf=";
+ test_report_ += context.column_family_name;
+ test_report_ += ",cs=";
+ test_report_ +=
+ OptionsHelper::compaction_style_to_string[context.compaction_style];
+ test_report_ += ",lv=";
+ test_report_ += std::to_string(context.level_at_creation);
+ test_report_ += "\n";
+
+ return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context);
+ }
+
+ std::string DumpTestReport() {
+ std::string rv;
+ std::swap(rv, test_report_);
+ return rv;
+ }
+
+ private:
+ mutable std::string test_report_;
+};
+} // namespace
+
+TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
+ for (bool fifo : {true, false}) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.compaction_style =
+ fifo ? kCompactionStyleFIFO : kCompactionStyleLevel;
+
+ BlockBasedTableOptions table_options;
+ auto policy = std::make_shared<TestingContextCustomFilterPolicy>(15, 8, 5);
+ table_options.filter_policy = policy;
+ table_options.format_version = 5;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey / 2; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+    // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+ EXPECT_EQ(policy->DumpTestReport(),
+ fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n"
+ : "cf=bob,cs=kCompactionStyleLevel,lv=0\n");
+
+ for (int i = maxKey / 2; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Flush(1);
+ EXPECT_EQ(policy->DumpTestReport(),
+ fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n"
+ : "cf=bob,cs=kCompactionStyleLevel,lv=0\n");
+
+ // Check that they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ // Since we have two tables / two filters, we might have Bloom checks on
+ // our queries, but no more than one "useful" per query on a found key.
+ EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey);
+
+ // Check that we have two filters, each about
+ // fifo: 0.12% FP rate (15 bits per key)
+ // level: 2.3% FP rate (8 bits per key)
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ {
+ auto useful_count =
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975));
+ EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 0.9995 : 0.98));
+ }
+
+ if (!fifo) { // FIFO only has L0
+ // Full compaction
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr));
+ EXPECT_EQ(policy->DumpTestReport(),
+ "cf=bob,cs=kCompactionStyleLevel,lv=1\n");
+
+ // Check that we now have one filter, about 9.2% FP rate (5 bits per key)
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ {
+ auto useful_count =
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ EXPECT_GE(useful_count, maxKey * 0.90);
+ EXPECT_LE(useful_count, maxKey * 0.91);
+ }
+ }
+
+ // Destroy
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ dbfull()->DestroyColumnFamilyHandle(handles_[1]);
+ handles_[1] = nullptr;
+ }
+}
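+
+// A sketch of the arithmetic behind the ranges checked above: each
+// missing-key Get() consults both table filters, so there are about
+// maxKey * 2 = 20000 filter checks. BLOOM_FILTER_USEFUL counts the checks
+// that correctly excluded the key, i.e. roughly (1 - FP rate) * 20000:
+// about 0.9980-0.9995 of the checks for the 15-bits-per-key filters in the
+// FIFO case and about 0.975-0.98 for the 8-bits-per-key filters in the
+// level case, matching the EXPECT_GE / EXPECT_LE bounds.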
+
+class SliceTransformLimitedDomain : public SliceTransform {
+ const char* Name() const override { return "SliceTransformLimitedDomain"; }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 5);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // prefix will be x????
+ return src.size() >= 5 && src[0] == 'x';
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // prefix will be x????
+ return dst.size() == 5 && dst[0] == 'x';
+ }
+};
+
+TEST_F(DBBloomFilterTest, PrefixExtractorFullFilter) {
+ BlockBasedTableOptions bbto;
+ // Full Filter Block
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x1111_AAAA", "val1"));
+ ASSERT_OK(Put("x1112_AAAA", "val2"));
+ ASSERT_OK(Put("x1113_AAAA", "val3"));
+ ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to the filter
+ ASSERT_OK(Put("zzzzz_AAAA", "val5"));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("x1111_AAAA"), "val1");
+ ASSERT_EQ(Get("x1112_AAAA"), "val2");
+ ASSERT_EQ(Get("x1113_AAAA"), "val3");
+ ASSERT_EQ(Get("x1114_AAAA"), "val4");
+  // Was not added to the filter, but RocksDB will still try to read it from
+  // the filter
+ ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
+}
+
+TEST_F(DBBloomFilterTest, PrefixExtractorBlockFilter) {
+ BlockBasedTableOptions bbto;
+ // Block Filter Block
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true));
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x1113_AAAA", "val3"));
+ ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to the filter
+ ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+ ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+ ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+ ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> iter_res;
+ auto iter = db_->NewIterator(ReadOptions());
+ // Seek to a key that was not in Domain
+ for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+ iter_res.emplace_back(iter->value().ToString());
+ }
+
+ std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+ ASSERT_EQ(iter_res, expected_res);
+ delete iter;
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+ // regression test for #2743. the range delete tombstones in memtable should
+ // be added even when Get() skips searching due to its prefix bloom filter
+ const int kMemtableSize = 1 << 20; // 1MB
+ const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
+ const int kPrefixLen = 4;
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio =
+ static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+ options.write_buffer_size = kMemtableSize;
+ options.memtable_whole_key_filtering = false;
+ Reopen(options);
+ std::string key1("AAAABBBB");
+ std::string key2("AAAACCCC"); // not in DB
+ std::string key3("AAAADDDD");
+ std::string key4("AAAAEEEE");
+ std::string value1("Value1");
+ std::string value3("Value3");
+ std::string value4("Value4");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+
+ // check memtable bloom stats
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+ // same prefix, bloom filter false positive
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+ // enable whole key bloom filter
+ options.memtable_whole_key_filtering = true;
+ Reopen(options);
+ // check memtable bloom stats
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ // whole key bloom filter kicks in and determines it's a miss
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+ // verify whole key filtering does not depend on prefix_extractor
+ options.prefix_extractor.reset();
+ Reopen(options);
+ // check memtable bloom stats
+ ASSERT_OK(Put(key4, value4, WriteOptions()));
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ // whole key bloom filter kicks in and determines it's a miss
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+}
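+
+// A sketch of the sizing arithmetic used above:
+// memtable_prefix_bloom_size_ratio is the fraction of the write buffer
+// reserved for the memtable bloom filter, so here
+//   kMemtablePrefixFilterSize / kMemtableSize = (1 << 13) / (1 << 20)
+//   = 8 KB / 1 MB ~= 0.0078,
+// i.e. roughly 8 KB of bloom bits for the 1 MB memtable.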
+
+TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
+ constexpr size_t kPrefixSize = 8;
+ const std::string kKey = "key";
+ assert(kKey.size() < kPrefixSize);
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+ options.memtable_prefix_bloom_size_ratio = 0.25;
+ Reopen(options);
+ ASSERT_OK(Put(kKey, "v"));
+ ASSERT_EQ("v", Get(kKey));
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+ iter->Seek(kKey);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(kKey, iter->key());
+ iter->SeekForPrev(kKey);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(kKey, iter->key());
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+namespace BFP2 {
+// Extends BFP::Mode with option to use Plain table
+using PseudoMode = int;
+static constexpr PseudoMode kPlainTable = -1;
+} // namespace BFP2
+} // namespace
+
+class BloomStatsTestWithParam
+ : public DBBloomFilterTest,
+ public testing::WithParamInterface<std::tuple<BFP2::PseudoMode, bool>> {
+ public:
+ BloomStatsTestWithParam() {
+ bfp_impl_ = std::get<0>(GetParam());
+ partition_filters_ = std::get<1>(GetParam());
+
+ options_.create_if_missing = true;
+ options_.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4));
+ options_.memtable_prefix_bloom_size_ratio =
+ 8.0 * 1024.0 / static_cast<double>(options_.write_buffer_size);
+ if (bfp_impl_ == BFP2::kPlainTable) {
+ assert(!partition_filters_); // not supported in plain table
+ PlainTableOptions table_options;
+ options_.table_factory.reset(NewPlainTableFactory(table_options));
+ } else {
+ BlockBasedTableOptions table_options;
+ table_options.hash_index_allow_collision = false;
+ if (partition_filters_) {
+ assert(bfp_impl_ != BFP::kDeprecatedBlock);
+ table_options.partition_filters = partition_filters_;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy.reset(
+ new BFP(10, static_cast<BFP::Mode>(bfp_impl_)));
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+ options_.env = env_;
+
+ get_perf_context()->Reset();
+ DestroyAndReopen(options_);
+ }
+
+ ~BloomStatsTestWithParam() override {
+ get_perf_context()->Reset();
+ Destroy(options_);
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ BFP2::PseudoMode bfp_impl_;
+ bool partition_filters_;
+ Options options_;
+};
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expect the memtable bloom hit stat to be 2
+// 3 Call Get() for a nonexistent key - expect the memtable bloom miss stat to be 1
+// 4 Call Flush() to create an SST
+// 5 Call Get() for both keys - expect the SST bloom hit stat to be 2
+// 6 Call Get() for a nonexistent key - expect the SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+ std::string key1("AAAA");
+ std::string key2("RXDB"); // not in DB
+ std::string key3("ZBRA");
+ std::string value1("Value1");
+ std::string value3("Value3");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+ // check memtable bloom stats
+ ASSERT_EQ(value1, Get(key1));
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(value3, Get(key3));
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+ // sanity checks
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+ Flush();
+
+ // sanity checks
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+ // check SST bloom stats
+ ASSERT_EQ(value1, Get(key1));
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(value3, Get(key3));
+ ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
+
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+}
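+
+// A minimal sketch of reading these counters outside the test harness
+// (illustrative only; assumes an open DB `db` and that perf counting is
+// enabled, e.g. via SetPerfLevel(PerfLevel::kEnableCount)):
+//
+//   get_perf_context()->Reset();
+//   std::string v;
+//   db->Get(ReadOptions(), "some_key", &v);
+//   uint64_t mem_hits = get_perf_context()->bloom_memtable_hit_count;
+//   uint64_t sst_misses = get_perf_context()->bloom_sst_miss_count;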
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+ std::string key1("AAAA");
+ std::string key2("RXDB"); // not in DB
+ std::string key3("ZBRA");
+ std::string value1("Value1");
+ std::string value3("Value3");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+ // check memtable bloom stats
+ iter->Seek(key1);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value1, iter->value().ToString());
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ iter->Seek(key3);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value3, iter->value().ToString());
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ iter->Seek(key2);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+ Flush();
+
+ iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+ // Check SST bloom stats
+ iter->Seek(key1);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value1, iter->value().ToString());
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+
+ iter->Seek(key3);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value3, iter->value().ToString());
+  // The seek doesn't check the block-based bloom filter because the last
+  // index key starts with the same prefix we're seeking to.
+ uint64_t expected_hits = bfp_impl_ == BFP::kDeprecatedBlock ? 1 : 2;
+ ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+
+ iter->Seek(key2);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+ ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ BloomStatsTestWithParam, BloomStatsTestWithParam,
+ ::testing::Values(std::make_tuple(BFP::kDeprecatedBlock, false),
+ std::make_tuple(BFP::kLegacyBloom, false),
+ std::make_tuple(BFP::kLegacyBloom, true),
+ std::make_tuple(BFP::kFastLocalBloom, false),
+ std::make_tuple(BFP::kFastLocalBloom, true),
+ std::make_tuple(BFP2::kPlainTable, false)));
+
+namespace {
+void PrefixScanInit(DBBloomFilterTest* dbtest) {
+ char buf[100];
+ std::string keystr;
+ const int small_range_sstfiles = 5;
+ const int big_range_sstfiles = 5;
+
+ // Generate 11 sst files with the following prefix ranges.
+ // GROUP 0: [0,10] (level 1)
+ // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0)
+ // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0)
+ //
+ // A seek with the previous API would do 11 random I/Os (to all the
+ // files). With the new API and a prefix filter enabled, we should
+  // only do 2 random I/Os, to the 2 files containing the key.
+
+ // GROUP 0
+ snprintf(buf, sizeof(buf), "%02d______:start", 0);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", 10);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr,
+ nullptr); // move to level 1
+
+ // GROUP 1
+ for (int i = 1; i <= small_range_sstfiles; i++) {
+ snprintf(buf, sizeof(buf), "%02d______:start", i);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", i + 1);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ }
+
+ // GROUP 2
+ for (int i = 1; i <= big_range_sstfiles; i++) {
+ snprintf(buf, sizeof(buf), "%02d______:start", 0);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ }
+}
+} // namespace
+
+TEST_F(DBBloomFilterTest, PrefixScan) {
+ while (ChangeFilterOptions()) {
+ int count;
+ Slice prefix;
+ Slice key;
+ char buf[100];
+ Iterator* iter;
+ snprintf(buf, sizeof(buf), "03______:");
+ prefix = Slice(buf, 8);
+ key = Slice(buf, 9);
+ ASSERT_EQ(key.difference_offset(prefix), 8);
+ ASSERT_EQ(prefix.difference_offset(key), 8);
+ // db configs
+ env_->count_random_reads_ = true;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.disable_auto_compactions = true;
+ options.max_background_compactions = 2;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ assert(!options.unordered_write);
+ // It is incompatible with allow_concurrent_memtable_write=false
+ options.allow_concurrent_memtable_write = false;
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ table_options.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // 11 RAND I/Os
+ DestroyAndReopen(options);
+ PrefixScanInit(this);
+ count = 0;
+ env_->random_read_counter_.Reset();
+ iter = db_->NewIterator(ReadOptions());
+ for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+ if (!iter->key().starts_with(prefix)) {
+ break;
+ }
+ count++;
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+ ASSERT_EQ(count, 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+ Close();
+ } // end of while
+}
+
+TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024;
+ options.arena_block_size = 4 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 256 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ options.compression = kNoCompression;
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.optimize_filters_for_hits = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ CreateAndReopenWithCF({"mypikachu"}, options);
+
+ int numkeys = 200000;
+
+ // Generate randomly shuffled keys, so the updates are almost
+ // random.
+ std::vector<int> keys;
+ keys.reserve(numkeys);
+ for (int i = 0; i < numkeys; i += 2) {
+ keys.push_back(i);
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ int num_inserted = 0;
+ for (int key : keys) {
+ ASSERT_OK(Put(1, Key(key), "val"));
+ if (++num_inserted % 1000 == 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ }
+ ASSERT_OK(Put(1, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(numkeys), "val"));
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ if (NumTableFilesAtLevel(0, 1) == 0) {
+ // No Level 0 file. Create one.
+ ASSERT_OK(Put(1, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(numkeys), "val"));
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ for (int i = 1; i < numkeys; i += 2) {
+ ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+ }
+
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  // Now we have three sorted runs, L0, L5 and L6, with most files in L6
+  // having no bloom filter. Most keys will be checked against bloom filters
+  // twice.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2);
+ ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2);
+ uint64_t bloom_filter_useful_all_levels = 0;
+ for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+ if (kv.second.bloom_filter_useful > 0) {
+ bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+ }
+ }
+ ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2);
+ ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2);
+
+ for (int i = 0; i < numkeys; i += 2) {
+ ASSERT_EQ(Get(1, Key(i)), "val");
+ }
+
+ // Part 2 (read path): rewrite last level with blooms, then verify they get
+ // cached only if !optimize_filters_for_hits
+ options.disable_auto_compactions = true;
+ options.num_levels = 9;
+ options.optimize_filters_for_hits = false;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+ MoveFilesToLevel(7 /* level */, 1 /* column family index */);
+
+ std::string value = Get(1, Key(0));
+ uint64_t prev_cache_filter_hits =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ value = Get(1, Key(0));
+ ASSERT_EQ(prev_cache_filter_hits + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Now that we know the filter blocks exist in the last level files, see if
+ // filter caching is skipped for this optimization
+ options.optimize_filters_for_hits = true;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ value = Get(1, Key(0));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(2 /* index and data block */,
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ // Check filter block ignored for files preloaded during DB::Open()
+ options.max_open_files = -1;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ uint64_t prev_cache_filter_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ Get(1, Key(0));
+ ASSERT_EQ(prev_cache_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(prev_cache_filter_hits,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Check filter block ignored for file trivially-moved to bottom level
+ bbto.block_cache.reset();
+ options.max_open_files = 100; // setting > -1 makes it not preload all files
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ ASSERT_OK(Put(1, Key(numkeys + 1), "val"));
+ ASSERT_OK(Flush(1));
+
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CompactRangeOptions compact_options;
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ compact_options.change_level = true;
+ compact_options.target_level = 7;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ prev_cache_filter_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ value = Get(1, Key(numkeys + 1));
+ ASSERT_EQ(prev_cache_filter_hits,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(prev_cache_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+
+ // Check filter block not cached for iterator
+ bbto.block_cache.reset();
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(2 /* index and data block */,
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ get_perf_context()->Reset();
+}
+
+int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
+ int count = 0;
+ for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK();
+ iter->Next()) {
+ count++;
+ }
+ return count;
+}
+
+// Use iterate_upper_bound to hint compatibility of existing bloom filters.
+// The BF is considered compatible if 1) the upper bound and the seek key
+// transform into the same string, or 2) the transformed seek key is of the
+// same length as the upper bound and the two keys are adjacent according to
+// the comparator.
+TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
+ for (auto bfp_impl : BFP::kAllFixedImpls) {
+ int using_full_builder = bfp_impl != BFP::kDeprecatedBlock;
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl));
+ table_options.index_shortening = BlockBasedTableOptions::
+ IndexShorteningMode::kShortenSeparatorsAndSuccessor;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("abcdxxx0", "val1"));
+ ASSERT_OK(Put("abcdxxx1", "val2"));
+ ASSERT_OK(Put("abcdxxx2", "val3"));
+ ASSERT_OK(Put("abcdxxx3", "val4"));
+ dbfull()->Flush(FlushOptions());
+ {
+ // prefix_extractor has not changed, BF will always be read
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+ }
+ {
+ Slice upper_bound("abcdzzzz");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.FixedPrefix.5"));
+ {
+ // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
+ // should check bloom filter since upper bound meets requirement
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+      // [abcdxx01, abcey) is not a valid bound since the upper bound is too
+      // long for the BF in the SST (capped:4)
+ Slice upper_bound("abcey");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
+ // should skip bloom filter since upper bound is too long
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+ // [abcdxx02, abcdy) is a valid bound since the prefix is the same
+ Slice upper_bound("abcdy");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx02"), 4);
+ // should check bloom filter since upper bound matches transformed seek
+ // key
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+ // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the
+ // same prefix, 2) the prefixes are not consecutive
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
+ // should skip bloom filter since mismatch is found
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
+ {
+ // [abc, abd) is not a valid bound since the upper bound is too short
+ // for BF (capped:4)
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}}));
+ {
+ // set back to capped:4 and verify BF is always read
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 3 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ }
+ }
+}
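+
+// A concrete illustration of the upper-bound rule exercised above, using
+// keys from this test: with a capped:4 extractor the seek key "abcd0000"
+// transforms to "abcd". The upper bound "abce" has the same length and is
+// the immediate successor of "abcd", so the prefix filter can be consulted;
+// "abcey" is longer than the prefix, so the filter is skipped for that
+// bound.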
+
+// Create multiple SST files each with a different prefix_extractor config,
+// verify iterators can read all SST files using the latest config.
+TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
+ for (auto bfp_impl : BFP::kAllFixedImpls) {
+ int using_full_builder = bfp_impl != BFP::kDeprecatedBlock;
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl));
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ Slice upper_bound("foz90000");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+
+ // first SST with fixed:1 BF
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foq1", "bar1"));
+ ASSERT_OK(Put("fpa", "0"));
+ dbfull()->Flush(FlushOptions());
+ std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1);
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.CappedPrefix.3"));
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "foo"), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 1 + using_full_builder);
+ ASSERT_EQ(CountIter(iter, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 1 + using_full_builder);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+ // second SST with capped:3 BF
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Put("foq5", "bar5"));
+ ASSERT_OK(Put("fpb", "1"));
+ dbfull()->Flush(FlushOptions());
+ {
+ // BF is capped:3 now
+ std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 2 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+ // both counters are incremented because the BF is "not changed" for 1 of
+ // the 2 SST files, so the filter is checked once and finds no match.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 3 + using_full_builder * 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.FixedPrefix.2"));
+ // third SST with fixed:2 BF
+ ASSERT_OK(Put("foo6", "bar6"));
+ ASSERT_OK(Put("foo7", "bar7"));
+ ASSERT_OK(Put("foq8", "bar8"));
+ ASSERT_OK(Put("fpc", "2"));
+ dbfull()->Flush(FlushOptions());
+ {
+ // BF is fixed:2 now
+ std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
+ // the first and last BF are checked
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 4 + using_full_builder * 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+ // only last BF is checked and not found
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 5 + using_full_builder * 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ }
+
+ // iter_old can only see the first SST, so the checked counter goes up by 1
+ ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 6 + using_full_builder * 3);
+ // iter was created after the first SetOptions() call, so the filter is
+ // checked only when the full filter builder is used
+ ASSERT_EQ(CountIter(iter, "foo"), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 6 + using_full_builder * 4);
+
+ {
+ // keys in all three SSTs are visible to the iterator.
+ // The range [foo, foz90000) is compatible with (fixed:1) and (fixed:2),
+ // so the checked counter goes up by 2
+ std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_all, "foo"), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 7 + using_full_builder * 5);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 8 + using_full_builder * 5);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.CappedPrefix.3"));
+ {
+ std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_all, "foo"), 6);
+ // all three SSTs are checked because the current option is the same as
+ // the one used by the remaining SST (capped:3)
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 9 + using_full_builder * 7);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+ 10 + using_full_builder * 7);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4);
+ }
+ // TODO(Zhongyi): Maybe also need to add Get calls to test point lookup?
+ }
+}
+
+// Create a new column family in a running DB, change prefix_extractor
+// dynamically, verify the iterator created on the new column family behaves
+// as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
+ int iteration = 0;
+ for (auto bfp_impl : BFP::kAllFixedImpls) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options);
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ // create a new CF and set prefix_extractor dynamically
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options);
+ ASSERT_EQ(0,
+ strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(),
+ "rocksdb.CappedPrefix.3"));
+ ASSERT_OK(Put(2, "foo3", "bar3"));
+ ASSERT_OK(Put(2, "foo4", "bar4"));
+ ASSERT_OK(Put(2, "foo5", "bar5"));
+ ASSERT_OK(Put(2, "foq6", "bar6"));
+ ASSERT_OK(Put(2, "fpq7", "bar7"));
+ dbfull()->Flush(FlushOptions());
+ {
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(read_options, handles_[2]));
+ ASSERT_EQ(CountIter(iter, "foo"), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ(0,
+ strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(),
+ "rocksdb.FixedPrefix.2"));
+ {
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(read_options, handles_[2]));
+ ASSERT_EQ(CountIter(iter, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
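+ // Drop and destroy the extra column families so the next loop iteration
+ // starts from a clean state.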
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[2]));
+ dbfull()->DestroyColumnFamilyHandle(handles_[2]);
+ handles_[2] = nullptr;
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ dbfull()->DestroyColumnFamilyHandle(handles_[1]);
+ handles_[1] = nullptr;
+ iteration++;
+ }
+}
+
+// Verify it's possible to change prefix_extractor at runtime and iterators
+// behave as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
+ for (auto bfp_impl : BFP::kAllFixedImpls) {
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(new BFP(10, bfp_impl));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("fpa", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Put("foo5", "bar5"));
+ ASSERT_OK(Put("fpb", "1"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("foo6", "bar6"));
+ ASSERT_OK(Put("foo7", "bar7"));
+ ASSERT_OK(Put("foo8", "bar8"));
+ ASSERT_OK(Put("fpc", "2"));
+ dbfull()->Flush(FlushOptions());
+
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
+ "rocksdb.CappedPrefix.3"));
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ // "fp*" should be skipped
+ ASSERT_EQ(CountIter(iter, "foo"), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+
+ // the iterator created before should not be affected and still sees all keys
+ ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ ASSERT_EQ(CountIter(iter_old, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_filter_test.cc b/src/rocksdb/db/db_compaction_filter_test.cc
new file mode 100644
index 000000000..a708c0b1a
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_filter_test.cc
@@ -0,0 +1,872 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
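+// Counters bumped by the compaction filters below; individual tests reset
+// them before triggering compactions and then assert on their values.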
+static int cfilter_count = 0;
+static int cfilter_skips = 0;
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+static std::string NEW_VALUE = "NewValue";
+
+class DBTestCompactionFilter : public DBTestBase {
+ public:
+ DBTestCompactionFilter() : DBTestBase("/db_compaction_filter_test") {}
+};
+
+// Param variant of DBTestBase::ChangeCompactOptions
+class DBTestCompactionFilterWithCompactParam
+ : public DBTestCompactionFilter,
+ public ::testing::WithParamInterface<DBTestBase::OptionConfig> {
+ public:
+ DBTestCompactionFilterWithCompactParam() : DBTestCompactionFilter() {
+ option_config_ = GetParam();
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ if (option_config_ == kDefault || option_config_ == kUniversalCompaction ||
+ option_config_ == kUniversalCompactionMultiLevel) {
+ options.create_if_missing = true;
+ }
+ if (option_config_ == kLevelSubcompactions ||
+ option_config_ == kUniversalSubcompactions) {
+ assert(options.max_subcompactions > 1);
+ }
+ TryReopen(options);
+ }
+};
+
+#ifndef ROCKSDB_VALGRIND_RUN
+INSTANTIATE_TEST_CASE_P(
+ CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam,
+ ::testing::Values(DBTestBase::OptionConfig::kDefault,
+ DBTestBase::OptionConfig::kUniversalCompaction,
+ DBTestBase::OptionConfig::kUniversalCompactionMultiLevel,
+ DBTestBase::OptionConfig::kLevelSubcompactions,
+ DBTestBase::OptionConfig::kUniversalSubcompactions));
+#else
+// Run fewer cases in valgrind
+INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption,
+ DBTestCompactionFilterWithCompactParam,
+ ::testing::Values(DBTestBase::OptionConfig::kDefault));
+#endif // ROCKSDB_VALGRIND_RUN
+
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class DeleteFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ return true;
+ }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
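+// Deletes keys whose integer value is in (5, 105]; snapshots are ignored.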
+class DeleteISFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ int i = std::stoi(key.ToString());
+ if (i > 5 && i <= 105) {
+ return true;
+ }
+ return false;
+ }
+
+ bool IgnoreSnapshots() const override { return true; }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
+// Skip x if floor(x/10) is even, use range skips. Requires that keys are
+// zero-padded to length 10.
+class SkipEvenFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ cfilter_count++;
+ int i = std::stoi(key.ToString());
+ if (i / 10 % 2 == 0) {
+ char key_str[100];
+ snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10);
+ *skip_until = key_str;
+ ++cfilter_skips;
+ return Decision::kRemoveAndSkipUntil;
+ }
+ return Decision::kKeep;
+ }
+
+ bool IgnoreSnapshots() const override { return true; }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+ explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ db_test->env_->addon_time_.fetch_add(1000);
+ return true;
+ }
+
+ const char* Name() const override { return "DelayFilter"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class ConditionalFilter : public CompactionFilter {
+ public:
+ explicit ConditionalFilter(const std::string* filtered_value)
+ : filtered_value_(filtered_value) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return value.ToString() == *filtered_value_;
+ }
+
+ const char* Name() const override { return "ConditionalFilter"; }
+
+ private:
+ const std::string* filtered_value_;
+};
+
+class ChangeFilter : public CompactionFilter {
+ public:
+ explicit ChangeFilter() {}
+
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* new_value, bool* value_changed) const override {
+ assert(new_value != nullptr);
+ *new_value = NEW_VALUE;
+ *value_changed = true;
+ return false;
+ }
+
+ const char* Name() const override { return "ChangeFilter"; }
+};
+
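+// Creates KeepFilter instances; optionally checks that the compaction
+// context (full/manual compaction flags, column family id) matches the
+// values the test expects.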
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false,
+ bool check_context_cf_id = false)
+ : check_context_(check_context),
+ check_context_cf_id_(check_context_cf_id),
+ compaction_filter_created_(false) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ if (check_context_cf_id_) {
+ EXPECT_EQ(expect_cf_id_.load(), context.column_family_id);
+ }
+ compaction_filter_created_ = true;
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ bool compaction_filter_created() const { return compaction_filter_created_; }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ bool check_context_cf_id_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+ std::atomic<uint32_t> expect_cf_id_;
+ bool compaction_filter_created_;
+};
+
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+// Delete Filter Factory which ignores snapshots
+class DeleteISFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new DeleteISFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+class SkipEvenFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new SkipEvenFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "SkipEvenFilterFactory"; }
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+ }
+
+ const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class ConditionalFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ConditionalFilterFactory(const Slice& filtered_value)
+ : filtered_value_(filtered_value.ToString()) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(
+ new ConditionalFilter(&filtered_value_));
+ }
+
+ const char* Name() const override { return "ConditionalFilterFactory"; }
+
+ private:
+ std::string filtered_value_;
+};
+
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ChangeFilterFactory() {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+ }
+
+ const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilter) {
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ options.num_levels = 3;
+ options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write 100K keys, these are written to a few files in L0.
+ const std::string value(10, 'x');
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(1, key, value);
+ }
+ ASSERT_OK(Flush(1));
+
+ // Push all files to the highest level L2. Verify that
+ // the compaction at each level invokes the filter for
+ // all the keys in that level.
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+ cfilter_count = 0;
+
+ // All the files are in the lowest level.
+ // Verify that every record now has sequence number zero; the compaction
+ // to the bottom level zeroes out sequence numbers since no snapshot
+ // needs them.
+ int count = 0;
+ int total = 0;
+ Arena arena;
+ {
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ &arena, &range_del_agg, kMaxSequenceNumber, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+ total++;
+ if (ikey.sequence != 0) {
+ count++;
+ }
+ iter->Next();
+ }
+ }
+ ASSERT_EQ(total, 100000);
+ ASSERT_EQ(count, 0);
+
+ // overwrite all the 100K keys once again.
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+
+ // push all files to the highest level L2. This
+ // means that all keys should pass through the compaction filter
+ // at least once
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+ // create a new database with the compaction
+ // filter in such a way that it deletes all keys
+ options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // write all the keys once again.
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+ // Push all files to the highest level L2. This
+ // triggers the compaction filter to delete all keys,
+ // verify that at the end of the compaction process,
+ // nothing is left.
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(cfilter_count, 0);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+ {
+ // Scan the entire database to ensure that nothing is left
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ iter->SeekToFirst();
+ count = 0;
+ while (iter->Valid()) {
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 0);
+ }
+
+ // The sequence number of the remaining record
+ // is not zeroed out even though it is at the
+ // level Lmax because this record is at the tip
+ count = 0;
+ {
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ &arena, &range_del_agg, kMaxSequenceNumber, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+ ASSERT_NE(ikey.sequence, (unsigned)0);
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 0);
+ }
+}
+
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create a bunch of 'DeleteFile'
+// entries in VersionEdit, but none of the 'AddFile's.
+TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // put some data
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+ }
+
+ // this will produce empty file (delete compaction filter)
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(0U, CountLiveFiles());
+
+ Reopen(options);
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ // empty db
+ ASSERT_TRUE(!itr->Valid());
+
+ delete itr;
+}
+#endif // ROCKSDB_LITE
+
+TEST_P(DBTestCompactionFilterWithCompactParam,
+ CompactionFilterWithValueChange) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.compaction_filter_factory = std::make_shared<ChangeFilterFactory>();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write 100K+1 keys; these are written to a few files in L0. We do this
+ // so that the current snapshot points to the 100001st key. The compaction
+ // filter is not invoked on keys that are visible via a snapshot because
+ // we cannot delete them anyway.
+ const std::string value(10, 'x');
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(1, key, value);
+ }
+
+ // push all files to lower levels
+ ASSERT_OK(Flush(1));
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ } else {
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ }
+
+ // re-write all data again
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(1, key, value);
+ }
+
+ // push all files to lower levels. This should
+ // invoke the compaction filter for all 100000 keys.
+ ASSERT_OK(Flush(1));
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ } else {
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ }
+
+ // verify that all keys now have the new value that
+ // was set by the compaction process.
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ std::string newvalue = Get(1, key);
+ ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+ }
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
+ std::string one, two, three, four;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+ PutFixed64(&four, 4);
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.num_levels = 3;
+ // Filter out keys whose value is 2.
+ options.compaction_filter_factory =
+ std::make_shared<ConditionalFilterFactory>(two);
+ DestroyAndReopen(options);
+
+ // In the same compaction, a value-type entry needs to be deleted based on
+ // the compaction filter, and there is a merge-type entry for the key. The
+ // compaction filter result is ignored.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", one));
+ ASSERT_OK(Flush());
+ std::string newvalue = Get("foo");
+ ASSERT_EQ(newvalue, three);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("foo");
+ ASSERT_EQ(newvalue, three);
+
+ // A value-type entry can be deleted based on the compaction filter,
+ // leaving only merge-type entries.
+ ASSERT_OK(db_->Put(WriteOptions(), "bar", two));
+ ASSERT_OK(Flush());
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("bar");
+ ASSERT_EQ("NOT_FOUND", newvalue);
+ ASSERT_OK(db_->Merge(WriteOptions(), "bar", two));
+ ASSERT_OK(Flush());
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("bar");
+ ASSERT_EQ(newvalue, two);
+
+ // Compaction filter never applies to merge keys.
+ ASSERT_OK(db_->Put(WriteOptions(), "foobar", one));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two));
+ ASSERT_OK(Flush());
+ newvalue = Get("foobar");
+ ASSERT_EQ(newvalue, three);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("foobar");
+ ASSERT_EQ(newvalue, three);
+
+ // In the same compaction, both the value-type and merge-type entries for
+ // the key need to be deleted based on the compaction filter. For both
+ // entries, the compaction filter results are ignored.
+ ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two));
+ ASSERT_OK(Flush());
+ newvalue = Get("barfoo");
+ ASSERT_EQ(newvalue, four);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ newvalue = Get("barfoo");
+ ASSERT_EQ(newvalue, four);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
+ KeepFilterFactory* filter = new KeepFilterFactory(true, true);
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_filter_factory.reset(filter);
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 8;
+ Reopen(options);
+ int num_keys_per_file = 400;
+ for (int j = 0; j < 3; j++) {
+ // Write several keys.
+ const std::string value(10, 'x');
+ for (int i = 0; i < num_keys_per_file; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%08d%02d", i, j);
+ Put(key, value);
+ }
+ dbfull()->TEST_FlushMemTable();
+ // Make sure next file is much smaller so automatic compaction will not
+ // be triggered.
+ num_keys_per_file /= 2;
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // Force a manual compaction
+ cfilter_count = 0;
+ filter->expect_manual_compaction_.store(true);
+ filter->expect_full_compaction_.store(true);
+ filter->expect_cf_id_.store(0);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(cfilter_count, 700);
+ ASSERT_EQ(NumSortedRuns(0), 1);
+ ASSERT_TRUE(filter->compaction_filter_created());
+
+ // Verify total number of keys is correct after manual compaction.
+ {
+ int count = 0;
+ int total = 0;
+ Arena arena;
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ &arena, &range_del_agg, kMaxSequenceNumber));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+ total++;
+ if (ikey.sequence != 0) {
+ count++;
+ }
+ iter->Next();
+ }
+ ASSERT_EQ(total, 700);
+ ASSERT_EQ(count, 0);
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
+ KeepFilterFactory* filter = new KeepFilterFactory(false, true);
+ filter->expect_cf_id_.store(1);
+
+ Options options = CurrentOptions();
+ options.compaction_filter_factory.reset(filter);
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ int num_keys_per_file = 400;
+ for (int j = 0; j < 3; j++) {
+ // Write several keys.
+ const std::string value(10, 'x');
+ for (int i = 0; i < num_keys_per_file; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%08d%02d", i, j);
+ Put(1, key, value);
+ }
+ Flush(1);
+ // Make sure next file is much smaller so automatic compaction will not
+ // be triggered.
+ num_keys_per_file /= 2;
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_TRUE(filter->compaction_filter_created());
+}
+
+#ifndef ROCKSDB_LITE
+// Compaction filters apply to all records, regardless of snapshots.
+TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
+ std::string five = ToString(5);
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<DeleteISFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Put some data.
+ const Snapshot* snapshot = nullptr;
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+
+ if (table == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ assert(snapshot != nullptr);
+
+ cfilter_count = 0;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // The filter should be invoked for all 40 records.
+ ASSERT_EQ(40, cfilter_count);
+
+ {
+ // Scan the entire database as of the snapshot to ensure
+ // that nothing is left
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ int count = 0;
+ while (iter->Valid()) {
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 6);
+ read_options.snapshot = nullptr;
+ std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options));
+ iter1->SeekToFirst();
+ count = 0;
+ while (iter1->Valid()) {
+ count++;
+ iter1->Next();
+ }
+ // We have deleted 10 keys from 40 using the compaction filter
+ // Keys 6-9 before the snapshot and 100-105 after the snapshot
+ ASSERT_EQ(count, 30);
+ }
+
+ // Release the snapshot and compact again -> now all records should be
+ // removed.
+ db_->ReleaseSnapshot(snapshot);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, SkipUntil) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Write keys to a few files in L0.
+ for (int table = 0; table < 4; ++table) {
+ // Key ranges in tables are [0, 38], [106, 149], [212, 260], [318, 371].
+ for (int i = table * 6; i < 39 + table * 11; ++i) {
+ char key[100];
+ snprintf(key, sizeof(key), "%010d", table * 100 + i);
+ Put(key, std::to_string(table * 1000 + i));
+ }
+ Flush();
+ }
+
+ cfilter_skips = 0;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Number of skips in tables: 2, 3, 3, 3.
+ ASSERT_EQ(11, cfilter_skips);
+
+ for (int table = 0; table < 4; ++table) {
+ for (int i = table * 6; i < 39 + table * 11; ++i) {
+ int k = table * 100 + i;
+ char key[100];
+ snprintf(key, sizeof(key), "%010d", table * 100 + i);
+ auto expected = std::to_string(table * 1000 + i);
+ std::string val;
+ Status s = db_->Get(ReadOptions(), key, &val);
+ if (k / 10 % 2 == 0) {
+ ASSERT_TRUE(s.IsNotFound());
+ } else {
+ ASSERT_OK(s);
+ ASSERT_EQ(expected, val);
+ }
+ }
+ }
+}
+
+TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) {
+ BlockBasedTableOptions table_options;
+ table_options.whole_key_filtering = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(100, false));
+
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ Put("0000000010", "v10");
+ Put("0000000020", "v20"); // skipped
+ Put("0000000050", "v50");
+ Flush();
+
+ cfilter_skips = 0;
+ EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ EXPECT_EQ(1, cfilter_skips);
+
+ Status s;
+ std::string val;
+
+ s = db_->Get(ReadOptions(), "0000000010", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v10", val);
+
+ s = db_->Get(ReadOptions(), "0000000020", &val);
+ EXPECT_TRUE(s.IsNotFound());
+
+ s = db_->Get(ReadOptions(), "0000000050", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v50", val);
+}
+
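+// A filter that does not ignore snapshots; the test below expects manual
+// compaction to fail with Status::NotSupported for such a filter.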
+class TestNotSupportedFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return true;
+ }
+
+ const char* Name() const override { return "NotSupported"; }
+ bool IgnoreSnapshots() const override { return false; }
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) {
+ Options options = CurrentOptions();
+ options.compaction_filter = new TestNotSupportedFilter();
+ DestroyAndReopen(options);
+
+ Put("a", "v10");
+ Put("z", "v20");
+ Flush();
+
+ Put("a", "v10");
+ Put("z", "v20");
+ Flush();
+
+ // Compaction should fail because IgnoreSnapshots() = false
+ EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsNotSupported());
+
+ delete options.compaction_filter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 000000000..635aca135
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,5167 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/utilities/convenience.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SYNC_POINT is not supported in released Windows mode.
+#if !defined(ROCKSDB_LITE)
+
+class DBCompactionTest : public DBTestBase {
+ public:
+ DBCompactionTest() : DBTestBase("/db_compaction_test") {}
+};
+
+class DBCompactionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBCompactionTestWithParam() : DBTestBase("/db_compaction_test") {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBCompactionDirectIOTest : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBCompactionDirectIOTest() : DBCompactionTest() {}
+};
+
+namespace {
+
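+// Listener that records the paths of all SST files produced by flushes.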
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() { flushed_files_.clear(); }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
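+// Listener that counts completed compactions, flushes and external file
+// ingestions per CompactionReason.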
+class CompactionStatsCollector : public EventListener {
+public:
+ CompactionStatsCollector()
+ : compaction_completed_(static_cast<int>(CompactionReason::kNumOfReasons)) {
+ for (auto& v : compaction_completed_) {
+ v.store(0);
+ }
+ }
+
+ ~CompactionStatsCollector() override {}
+
+ void OnCompactionCompleted(DB* /* db */,
+ const CompactionJobInfo& info) override {
+ int k = static_cast<int>(info.compaction_reason);
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ assert(k >= 0 && k < num_of_reasons);
+ compaction_completed_[k]++;
+ }
+
+ void OnExternalFileIngested(
+ DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+ compaction_completed_[k]++;
+ }
+
+ void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kFlush);
+ compaction_completed_[k]++;
+ }
+
+ int NumberOfCompactions(CompactionReason reason) const {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ int k = static_cast<int>(reason);
+ assert(k >= 0 && k < num_of_reasons);
+ return compaction_completed_.at(k).load();
+ }
+
+private:
+ std::vector<std::atomic<int>> compaction_completed_;
+};
+
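+// Listener that counts how many SST file creations have started.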
+class SstStatsCollector : public EventListener {
+ public:
+ SstStatsCollector() : num_ssts_creation_started_(0) {}
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /* info */) override {
+ ++num_ssts_creation_started_;
+ }
+
+ int num_ssts_creation_started() { return num_ssts_creation_started_; }
+
+ private:
+ std::atomic<int> num_ssts_creation_started_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
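+// Options with tiny write buffers and an aggressive level-0 trigger so that
+// deletions quickly lead to compactions in the deletion-trigger tests.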
+Options DeletionTriggerOptions(Options options) {
+ options.compression = kNoCompression;
+ options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.num_levels = kCDTNumLevels;
+ options.level0_file_num_compaction_trigger = 1;
+ options.target_file_size_base = options.write_buffer_size * 2;
+ options.target_file_size_multiplier = 2;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * options.target_file_size_multiplier;
+ options.max_bytes_for_level_multiplier = 2;
+ options.disable_auto_compactions = false;
+ return options;
+}
+
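+// Returns true if the key ranges of SST files "a" and "b" overlap under
+// comparator "c".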
+bool HaveOverlappingKeyRanges(
+ const Comparator* c,
+ const SstFileMetaData& a, const SstFileMetaData& b) {
+ if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+ if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+
+// Identifies all files between level "min_level" and "max_level"
+// which have an overlapping key range with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+ const ColumnFamilyMetaData& cf_meta,
+ const Comparator* comparator,
+ int min_level, int max_level,
+ const SstFileMetaData* input_file_meta,
+ std::set<std::string>* overlapping_file_names) {
+ std::set<const SstFileMetaData*> overlapping_files;
+ overlapping_files.insert(input_file_meta);
+ for (int m = min_level; m <= max_level; ++m) {
+ for (auto& file : cf_meta.levels[m].files) {
+ for (auto* included_file : overlapping_files) {
+ if (HaveOverlappingKeyRanges(
+ comparator, *included_file, file)) {
+ overlapping_files.insert(&file);
+ overlapping_file_names->insert(file.name);
+ break;
+ }
+ }
+ }
+ }
+}
+
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+/*
+ * Verifies compaction stats of cfd are valid.
+ *
+ * For each level of cfd, its compaction stats are valid if
+ * 1) sum(stat.counts) == stat.count, and
+ * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+ */
+void VerifyCompactionStats(ColumnFamilyData& cfd,
+ const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+ InternalStats* internal_stats_ptr = cfd.internal_stats();
+ ASSERT_TRUE(internal_stats_ptr != nullptr);
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ const int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ std::vector<int> counts(num_of_reasons, 0);
+ // Count the number of compactions caused by each CompactionReason across
+ // all levels.
+ for (const auto& stat : comp_stats) {
+ int sum = 0;
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += stat.counts[i];
+ sum += stat.counts[i];
+ }
+ ASSERT_EQ(sum, stat.count);
+ }
+ // Verify InternalStats bookkeeping matches that of CompactionStatsCollector,
+ // assuming that all compactions complete.
+ for (int i = 0; i < num_of_reasons; i++) {
+ ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)), counts[i]);
+ }
+#endif /* NDEBUG */
+}
+
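+// Picks a random SST file from the column family metadata; if "level" is
+// non-null, it is set to the level of the chosen file.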
+const SstFileMetaData* PickFileRandomly(
+ const ColumnFamilyMetaData& cf_meta,
+ Random* rand,
+ int* level = nullptr) {
+ auto file_id = rand->Uniform(static_cast<int>(
+ cf_meta.file_count)) + 1;
+ for (auto& level_meta : cf_meta.levels) {
+ if (file_id <= level_meta.files.size()) {
+ if (level != nullptr) {
+ *level = level_meta.level;
+ }
+ auto result = rand->Uniform(file_id);
+ return &(level_meta.files[result]);
+ }
+ file_id -= static_cast<uint32_t>(level_meta.files.size());
+ }
+ assert(false);
+ return nullptr;
+}
+} // anonymous namespace
+
+#ifndef ROCKSDB_VALGRIND_RUN
+// All the TEST_P tests run once with sub_compactions disabled (i.e.
+// options.max_subcompactions = 1) and once with it enabled
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
+ for (int tid = 0; tid < 3; ++tid) {
+ uint64_t db_size[2];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+ // the following only disables stats update in DB::Open()
+ // and should not affect the result of this test.
+ options.skip_stats_update_on_db_open = true;
+ } else if (tid == 2) {
+ // third pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 1024;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+ // must have much smaller db size.
+ ASSERT_GT(db_size[0] / 3, db_size[1]);
+ }
+}
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) {
+ // For each options type we test the following:
+ // - Enable preserve_deletes
+ // - Write a bunch of keys and deletes
+ // - Set start_seqnum to the beginning; compact; check that keys are present
+ // - Advance start_seqnum far forward; compact; check that keys are gone
+
+ for (int tid = 0; tid < 3; ++tid) {
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+ options.preserve_deletes=true;
+ options.num_levels = 2;
+
+ if (tid == 1) {
+ options.skip_stats_update_on_db_open = true;
+ } else if (tid == 2) {
+ // third pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // highlight the default; all deletes should be preserved
+ SetPreserveDeletesSequenceNumber(0);
+
+ const int kTestSize = kCDTKeysPerBuffer;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // to ensure we tackle all tombstones
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ cro.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ // check that normal user iterator doesn't see anything
+ Iterator* db_iter = dbfull()->NewIterator(ReadOptions());
+ int i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 0);
+ delete db_iter;
+
+ // check that iterator that sees internal keys sees tombstones
+ ReadOptions ro;
+ ro.iter_start_seqnum=1;
+ db_iter = dbfull()->NewIterator(ro);
+ i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 4);
+ delete db_iter;
+
+ // now all deletes should be gone
+ SetPreserveDeletesSequenceNumber(100000000);
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ db_iter = dbfull()->NewIterator(ro);
+ i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 0);
+ delete db_iter;
+ }
+}
+
+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+ // This test verifies that UpdateAccumulatedStats is not called
+ // if options.skip_stats_update_on_db_open = true.
+ // The test will need to be updated if the internal behavior changes.
+
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+
+ ASSERT_OK(Flush());
+
+ Close();
+
+ int update_acc_stats_called = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionStorageInfo::UpdateAccumulatedStats",
+ [&](void* /* arg */) { ++update_acc_stats_called; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Reopen the DB with stats-update disabled
+ options.skip_stats_update_on_db_open = true;
+ options.max_open_files = 20;
+ Reopen(options);
+
+ ASSERT_EQ(update_acc_stats_called, 0);
+
+ // Repeat the reopen process, but this time we enable
+ // stats-update.
+ options.skip_stats_update_on_db_open = false;
+ Reopen(options);
+
+ ASSERT_GT(update_acc_stats_called, 0);
+
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.new_table_reader_for_compaction_inputs = true;
+ options.max_open_files = 20;
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ int num_table_cache_lookup = 0;
+ int num_new_table_reader = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0", [&](void* arg) {
+ assert(arg != nullptr);
+ bool no_io = *(reinterpret_cast<bool*>(arg));
+ if (!no_io) {
+ // filter out cases for table properties queries.
+ num_table_cache_lookup++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::GetTableReader:0",
+ [&](void* /*arg*/) { num_new_table_reader++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ ASSERT_OK(Put(Key(10 - k), "bar"));
+ if (k < options.level0_file_num_compaction_trigger - 1) {
+ num_table_cache_lookup = 0;
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // preloading the iterator issues one table cache lookup and creates
+ // a new table reader, if not preloaded.
+ int old_num_table_cache_lookup = num_table_cache_lookup;
+ ASSERT_GE(num_table_cache_lookup, 1);
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(k), Get(Key(k)));
+ // lookup iterator from table cache and no need to create a new one.
+ ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2);
+ ASSERT_EQ(num_new_table_reader, 0);
+ }
+ }
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Preloading iterator issues one table cache lookup and creates
+ // a new table reader. One file is created for flush and one for compaction.
+ // Compaction inputs make no table cache look-up for data/range deletion
+ // iterators
+ // May preload table cache too.
+ ASSERT_GE(num_table_cache_lookup, 2);
+ int old_num_table_cache_lookup2 = num_table_cache_lookup;
+
+ // Create new iterator for:
+ // (1) 1 for verifying flush results
+ // (2) 1 for verifying compaction results.
+ // (3) New TableReaders will not be created for compaction inputs
+ ASSERT_EQ(num_new_table_reader, 2);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ db_->CompactRange(cro, nullptr, nullptr);
+ // Only verifying compaction outputs issues one table cache lookup
+ // (for both the data block and the range deletion block).
+ // May preload the table cache too.
+ ASSERT_GE(num_table_cache_lookup, 1);
+ old_num_table_cache_lookup2 = num_table_cache_lookup;
+ // One for verifying compaction results.
+ // No new iterator created for compaction.
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
+ for (int tid = 0; tid < 2; ++tid) {
+ uint64_t db_size[3];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+ // second pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+    // As auto-compaction is off, we shouldn't see much reduction
+    // in db size.
+ ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+    // insert a relatively small amount of data to trigger auto compaction.
+ for (int k = 0; k < kTestSize / 10; ++k) {
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[2] = Size(Key(0), Key(kTestSize - 1));
+ // this time we're expecting significant drop in size.
+ ASSERT_GT(db_size[0] / 3, db_size[2]);
+ }
+}
+
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+ uint64_t db_size[3];
+ for (int test = 0; test < 2; ++test) {
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.skip_stats_update_on_db_open = (test == 0);
+
+ env_->random_read_counter_.Reset();
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ env_->random_read_counter_.Reset();
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+    // As auto-compaction is off, we shouldn't see much reduction
+    // in db size.
+ ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[2] = Size(Key(0), Key(kTestSize - 1));
+
+ if (options.skip_stats_update_on_db_open) {
+      // If updating stats on DB::Open is disabled, we don't expect
+      // the deletion entries to take effect.
+ ASSERT_LT(db_size[0] / 3, db_size[2]);
+ } else {
+ // Otherwise, we should see a significant drop in db size.
+ ASSERT_GT(db_size[0] / 3, db_size[2]);
+ }
+ }
+}
+
+
+TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+ }
+
+  // Generate one more file in level-0, which should trigger a level-0
+  // compaction.
+ std::vector<std::string> values;
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+TEST_F(DBCompactionTest, BGCompactionsAllowed) {
+  // Create several column families. Make compactions trigger in all of them
+  // and verify that the number of compactions scheduled stays within the
+  // allowed limit.
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ // Should speed up compaction when there are 4 files.
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 20;
+ options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large
+ options.max_background_compactions = 3;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+
+ // Block all threads in thread pool.
+ const size_t kTotalTasks = 4;
+ env_->SetBackgroundThreads(4, Env::LOW);
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ Random rnd(301);
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+  // Now all column families qualify for compaction, but only one should be
+  // scheduled because no column family hits the speed-up condition.
+ ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  // Create two more files for one column family, which triggers the speed-up
+  // condition; three compactions will be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(2, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(2, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 2));
+ }
+ ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ // Unblock all threads to unblock all compactions.
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+  // Verify that the number of compactions allowed comes back to 1.
+
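+  // Block the thread pool again and re-create the compaction triggers in
+  // all column families.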
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+  // Now all column families qualify for compaction, but only one should be
+  // scheduled because no column family hits the speed-up condition.
+ ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // Reopening moves updates to level-0
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+ for (int i = 0; i < 80; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, MinorCompactionsHappen) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int N = 500;
+
+ int starting_num_tables = TotalTableFiles(1);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+ }
+ int ending_num_tables = TotalTableFiles(1);
+ ASSERT_GT(ending_num_tables, starting_num_tables);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile1) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ Put("4", "A");
+ Put("3", "A");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ Put("2", "A");
+ Delete("3");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ Put("2", "B");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile2) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ Put("4", "A");
+ Put("3", "A");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ Put("2", "A");
+ SingleDelete("3");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ Put("2", "B");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, ZeroSeqIdCompaction) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ // compaction options
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ compact_opt.output_file_size_limit = 4096;
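+  // Each value is roughly one fifth of the output file size limit, so a few
+  // entries are enough to fill an output file.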
+ const size_t key_len =
+ static_cast<size_t>(compact_opt.output_file_size_limit) / 5;
+
+ DestroyAndReopen(options);
+
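+  // Take a snapshot after every Put so sequence numbers stay pinned
+  // (i.e., cannot be zeroed out) until the snapshots are released below.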
+ std::vector<const Snapshot*> snaps;
+
+ // create first file and flush to l0
+ for (auto& key : {"1", "2", "3", "3", "3", "3"}) {
+ Put(key, std::string(key_len, 'A'));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ // create second file and flush to l0
+ for (auto& key : {"3", "4", "5", "6", "7", "8"}) {
+ Put(key, std::string(key_len, 'A'));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ // move both files down to l1
+ dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1);
+
+  // Release the snapshots so that the first instance of key "3" can have
+  // seqId=0.
+ for (auto snap : snaps) {
+ dbfull()->ReleaseSnapshot(snap);
+ }
+
+  // Create 3 files in L0 to trigger compaction.
+ for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
+ Put("2", std::string(1, 'A'));
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+
+ dbfull()->TEST_WaitForCompact();
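+  // Sanity check: the DB should still accept writes after the compaction.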
+ ASSERT_OK(Put("", ""));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
+ // github issue #2249
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+
+ // create two files in l1 that we can compact
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
+ // make l0 files' ranges overlap to avoid trivial move
+ Put(std::to_string(2 * i), std::string(1, 'A'));
+ Put(std::to_string(2 * i + 1), std::string(1, 'A'));
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1);
+ }
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(2, cf_meta.levels[1].files.size());
+ std::vector<std::string> input_filenames;
+ for (const auto& sst_file : cf_meta.levels[1].files) {
+ input_filenames.push_back(sst_file.name);
+ }
+
+ // note CompactionOptions::output_file_size_limit is unset.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ dbfull()->CompactFiles(compact_opt, input_filenames, 1);
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger a long memtable compaction and reopen the database during it
+ ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file
+ ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable
+ ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction
+ ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+ ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+ } while (ChangeOptions());
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
+ int32_t trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t num_keys = 80;
+ int32_t value_size = 100 * 1024; // 100 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ for (int i = 0; i < num_keys; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+
+ // Reopening moves updates to L0
+ Reopen(options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1); // 1 file in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // 0 files in L1
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ LiveFileMetaData level0_file = metadata[0]; // L0 file meta
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Compaction will initiate a trivial move from L0 to L1
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+  // File moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // 1 file in L1
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
+ ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
+
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+  // Non-overlapping ranges.
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {100, 199},
+ {300, 399},
+ {0, 99},
+ {200, 299},
+ {600, 699},
+ {400, 499},
+ {500, 550},
+ {551, 599},
+ };
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Since data is non-overlapping we expect compaction to initiate
+ // a trivial move
+ db_->CompactRange(cro, nullptr, nullptr);
+ // We expect that all the files were trivially moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ trivial_move = 0;
+ non_trivial_move = 0;
+ values.clear();
+ DestroyAndReopen(options);
+ // Same ranges as above but overlapping
+ ranges = {
+ {100, 199},
+ {300, 399},
+ {0, 99},
+ {200, 299},
+ {600, 699},
+ {400, 499},
+      {500, 560},  // this range overlaps with the next one
+ {551, 599},
+ };
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ db_->CompactRange(cro, nullptr, nullptr);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+ ASSERT_EQ(trivial_move, 0);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 300]
+ for (int32_t i = 0; i <= 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [600 => 700]
+ for (int32_t i = 600; i <= 700; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int32_t i = 0; i <= 300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (int32_t i = 600; i <= 700; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ // Purpose of dependencies:
+ // 4 -> 1: ensure the order of two non-trivial compactions
+ // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions
+ // are installed
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:4");
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:3");
+ } else { // second non-trivial compaction
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 1 << 23; // 8 MB
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Trivial move the two non-overlapping files to level 6
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // 1 file in L0
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false));
+ // 2 files in L6, 1 file in L5
+ ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 6);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ // First non-trivial compaction is triggered
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:1");
+  // file 4 [300 => 400]
+ for (int32_t i = 300; i <= 400; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // file 5 [400 => 500]
+ for (int32_t i = 400; i <= 500; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // file 6 [500 => 600]
+ for (int32_t i = 500; i <= 600; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ // Second non-trivial compaction is triggered
+ ASSERT_OK(Flush());
+
+ // Before two non-trivial compactions are installed, there are 3 files in L0
+ ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0));
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:5");
+
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ // After two non-trivial compactions are installed, there is 1 file in L6, and
+ // 1 file in L1
+ ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0));
+ threads.join();
+
+ for (int32_t i = 0; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+// Disabled as the test is flaky.
+TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ bool second = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"},
+ {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ TEST_SYNC_POINT("DBCompaction::PartialFill:4");
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::PartialFill:3");
+ } else if (second) {
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L2, 1 in L0
+ ASSERT_EQ("1,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ // 2 files in L2, 1 in L1
+ ASSERT_EQ("0,1,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 2);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:1");
+  // Many files: keys [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+
+ // Verify level sizes
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:2");
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ threads.join();
+
+ for (int32_t i = 0; i < 4300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) {
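+  // Make the manual compaction overlap an in-flight unordered write: the
+  // compaction starts only after the write has appended to the WAL, and the
+  // write applies to the memtable only after the compaction begins waiting
+  // for pending writes.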
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+ "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"},
+ {"DBImpl::WaitForPendingWrites:BeforeBlock",
+ "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+
+ Options options = CurrentOptions();
+ options.unordered_write = true;
+ DestroyAndReopen(options);
+ Put("foo", "v1");
+ ASSERT_OK(Flush());
+
+ Put("bar", "v1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer([&]() { Put("foo", "v2"); });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ writer.join();
+ ASSERT_EQ(Get("foo"), "v2");
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Reopen(options);
+ ASSERT_EQ(Get("foo"), "v2");
+}
+
+TEST_F(DBCompactionTest, DeleteFileRange) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // Many files: keys [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify level sizes
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ size_t old_num_files = CountFiles();
+ std::string begin_string = Key(1000);
+ std::string end_string = Key(2000);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+
+ int32_t deleted_count = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ if (i < 1000 || i > 2000) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ } else {
+ ReadOptions roptions;
+ std::string result;
+ Status s = db_->Get(roptions, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound() || s.ok());
+ if (s.IsNotFound()) {
+ deleted_count++;
+ }
+ }
+ }
+ ASSERT_GT(deleted_count, 0);
+ begin_string = Key(5000);
+ end_string = Key(6000);
+ Slice begin1(begin_string);
+ Slice end1(end_string);
+  // Try deleting files in a range that contains no keys.
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+
+ // Push data from level 0 to level 1 to force all data to be deleted
+ // Note that we don't delete level 0 files
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+
+ int32_t deleted_count2 = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ ReadOptions roptions;
+ std::string result;
+ Status s = db_->Get(roptions, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ deleted_count2++;
+ }
+ ASSERT_GT(deleted_count2, deleted_count);
+ size_t new_num_files = CountFiles();
+ ASSERT_GT(old_num_files, new_num_files);
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRanges) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.max_background_compactions = 3;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file [0 => 100), [100 => 200), ... [900, 1000)
+ for (auto i = 0; i < 10; i++) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ values[k] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("10", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,10", FilesPerLevel(0));
+
+ // file [0 => 100), [200 => 300), ... [800, 900)
+  for (auto i = 0; i < 10; i += 2) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5,0,10", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ("0,5,10", FilesPerLevel(0));
+
+ // Delete files in range [0, 299] (inclusive)
+ {
+ auto begin_str1 = Key(0), end_str1 = Key(100);
+ auto begin_str2 = Key(100), end_str2 = Key(200);
+ auto begin_str3 = Key(200), end_str3 = Key(299);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size()));
+ ASSERT_EQ("0,3,7", FilesPerLevel(0));
+
+ // Keys [0, 300) should not exist.
+ for (auto i = 0; i < 300; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete files in range [600, 999) (exclusive)
+ {
+ auto begin_str1 = Key(600), end_str1 = Key(800);
+ auto begin_str2 = Key(700), end_str2 = Key(900);
+ auto begin_str3 = Key(800), end_str3 = Key(999);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size(), false));
+ ASSERT_EQ("0,1,4", FilesPerLevel(0));
+
+ // Keys [600, 900) should not exist.
+ for (auto i = 600; i < 900; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (auto i = 900; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete all files.
+ {
+ RangePtr range;
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ for (auto i = 0; i < 1000; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+  // Regression test for #2833: groups of files whose user-keys overlap at the
+  // endpoints could be split by `DeleteFilesInRange`. This caused old data to
+  // reappear, either because a new version of the key was removed, or a range
+  // deletion was partially dropped. It could also cause the non-overlapping
+  // invariant to be violated if the files dropped by DeleteFilesInRange were
+  // a subset of the files that a range deletion spans.
+ const int kNumL0Files = 2;
+ const int kValSize = 8 << 10; // 8KB
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ // The snapshot prevents key 1 from having its old version dropped. The low
+ // `target_file_size_base` ensures two keys will be in each output file.
+ const Snapshot* snapshot = nullptr;
+ Random rnd(301);
+ // The value indicates which flush the key belonged to, which is enough
+ // for us to determine the keys' relative ages. After L0 flushes finish,
+ // files look like:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[0]
+ // File 1: 1 -> vals[1], 2 -> vals[1]
+ //
+ // Then L0->L1 compaction happens, which outputs keys as follows:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[1]
+ // File 1: 1 -> vals[0], 2 -> vals[1]
+ //
+ // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+ // would cause `1 -> vals[0]` (an older key) to reappear.
+ std::string vals[kNumL0Files];
+ for (int i = 0; i < kNumL0Files; ++i) {
+ vals[i] = RandomString(&rnd, kValSize);
+ Put(Key(i), vals[i]);
+ Put(Key(i + 1), vals[i]);
+ Flush();
+ if (i == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify `DeleteFilesInRange` can't drop only file 0 which would cause
+ // "1 -> vals[0]" to reappear.
+ std::string begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+ ASSERT_EQ(vals[1], Get(Key(1)));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 200; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // options = CurrentOptions(options);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+  // The first three 110KB files do not go to the second path.
+  // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+  // Another 110KB triggers a compaction to a 400K file to fill up the
+  // first path.
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+ ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+ ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+ ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+ ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // options = CurrentOptions(options);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+  // Always gets compacted into 1 level-1 file,
+  // plus 0 or 1 level-0 files.
+ for (int num = 0; num < 3; num++) {
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+  CreateColumnFamilies({"one"}, option_vector[1]);
+
+  // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+  CreateColumnFamilies({"two"}, option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
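+  // Write one new SST file in each of the three column families.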
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
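+  // The DB paths and both column families' cf_paths should hold the same
+  // number of SST files at the given path index.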
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_filesperlevel = [&](const std::string& expected) {
+ ASSERT_EQ(expected, FilesPerLevel(0));
+ ASSERT_EQ(expected, FilesPerLevel(1));
+ ASSERT_EQ(expected, FilesPerLevel(2));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+  // Check that the default column family uses db_paths,
+  // and column family "one" uses cf_paths.
+
+  // The first three 110KB files do not go to the second path.
+  // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+
+  // Another 110KB triggers a compaction to a 400K file to fill up the
+  // first path.
+ generate_file();
+ check_sstfilecount(1, 3);
+
+ // (1, 4)
+ generate_file();
+ check_filesperlevel("1,4");
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 1)
+ generate_file();
+ check_filesperlevel("1,4,1");
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 2)
+ generate_file();
+ check_filesperlevel("1,4,2");
+ check_sstfilecount(2, 2);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ check_getvalues();
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
+ Random rnd(301);
+ int max_key_level_insert = 200;
+ int max_key_universal_insert = 600;
+
+ // Stage 1: generate a db with level compaction
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_bytes_for_level_base = 500 << 10; // 500KB
+ options.max_bytes_for_level_multiplier = 1;
+ options.target_file_size_base = 200 << 10; // 200KB
+ options.target_file_size_multiplier = 1;
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key_level_insert; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(TotalTableFiles(1, 4), 1);
+ int non_level0_num_files = 0;
+ for (int i = 1; i < options.num_levels; i++) {
+ non_level0_num_files += NumTableFilesAtLevel(i, 1);
+ }
+ ASSERT_GT(non_level0_num_files, 0);
+
+ // Stage 2: reopen with universal compaction - should fail
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Stage 3: compact into a single file and move the file to level 0
+ options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = INT_MAX;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = INT_MAX;
+ options.max_bytes_for_level_multiplier = 1;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
+  // Cannot use kForceOptimized here because this compaction is expected
+  // to generate one output file.
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+
+ // Only 1 file in L0
+ ASSERT_EQ("1", FilesPerLevel(1));
+
+ // Stage 4: re-open in universal compaction style and do some db operations
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 3;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ options.num_levels = 1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ }
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = 1; i < options.num_levels; i++) {
+ ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+ }
+
+ // verify keys inserted in both level compaction style and universal
+ // compaction style
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= max_key_universal_insert; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "b", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "b"));
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(a->v)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(a->v)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Delete(1, "e");
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "c", "cv");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "d", "dv");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Delete(1, "d");
+ Delete(1, "b");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, ManualAutoRace) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+ {"DBImpl::RunManualCompaction:WaitScheduled",
+ "BackgroundCallCompaction:0"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(1, "foo", "");
+ Put(1, "bar", "");
+ Flush(1);
+ Put(1, "foo", "");
+ Put(1, "bar", "");
+ // Generate four files in CF 0, which should trigger an auto compaction
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+
+ // The auto compaction is scheduled but has been waiting until this point.
+ TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+ // The auto compaction will wait until the manual compaction is registered
+ // before processing, so that it will be cancelled.
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+
+ // Eventually the cancelled compaction will be rescheduled and executed.
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p1", "p9");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ uint64_t prev_block_cache_add =
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(cro, handles_[1], nullptr, nullptr);
+ // Verify manual compaction doesn't fill block cache
+ ASSERT_EQ(prev_block_cache_add,
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD));
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+
+TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "p", "begin"));
+ ASSERT_OK(Put(1, "q", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p1", "p9", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Populate a different range
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "c", "begin"));
+ ASSERT_OK(Put(1, "e", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3,1", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compact all
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("1,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ if (iter == 0) {
+ DestroyAndReopen(options);
+ options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+ options.max_background_flushes = 1;
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ const size_t num_files = CountLiveFiles();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ }
+ ASSERT_EQ(CountLiveFiles(), num_files);
+ } while (ChangeCompactOptions());
+}
+
+ // Check level compaction with CompactFiles()
+TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.level0_stop_writes_trigger = 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+ for (int file_picked = 5; file_picked > 0; --file_picked) {
+ std::set<std::string> overlapping_file_names;
+ std::vector<std::string> compaction_input_file_names;
+ for (int f = 0; f < file_picked; ++f) {
+ int level = 0;
+ auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+ compaction_input_file_names.push_back(file_meta->name);
+ GetOverlappingFileNumbersForLevelCompaction(
+ cf_meta, options.comparator, level, output_level,
+ file_meta, &overlapping_file_names);
+ }
+
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), handles_[1],
+ compaction_input_file_names,
+ output_level));
+
+ // Make sure all overlapping files do not exist after compaction
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(cf_meta, overlapping_file_names);
+ }
+
+ // make sure all key-values are still there.
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND");
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
+ Options options;
+ const int kKeySize = 16;
+ const int kKvSize = 1000;
+ const int kKeysPerBuffer = 100;
+ const int kNumL1Files = 5;
+ options.create_if_missing = true;
+ options.write_buffer_size = kKeysPerBuffer * kKvSize;
+ options.max_write_buffer_number = 2;
+ options.target_file_size_base =
+ options.write_buffer_size *
+ (options.max_write_buffer_number - 1);
+ options.level0_file_num_compaction_trigger = kNumL1Files;
+ options.max_bytes_for_level_base =
+ options.level0_file_num_compaction_trigger *
+ options.target_file_size_base;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ // stop the compaction thread until we simulate the file creation failure.
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ const int kNumInsertedKeys =
+ options.level0_file_num_compaction_trigger *
+ (options.max_write_buffer_number - 1) *
+ kKeysPerBuffer;
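+ // = 5 * 1 * 100 = 500 keys, i.e. roughly level0_file_num_compaction_trigger
+ // memtables' worth of data, enough to reach the L0 compaction trigger.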
+
+ Random rnd(301);
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ keys.emplace_back(RandomString(&rnd, kKeySize));
+ values.emplace_back(RandomString(&rnd, kKvSize - kKeySize));
+ ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+
+ dbfull()->TEST_FlushMemTable(true);
+ // Make sure the number of L0 files can trigger compaction.
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+ // Fail the first file creation.
+ env_->non_writable_count_ = 1;
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Expect compaction to fail here as one file will fail its
+ // creation.
+ ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+ // Verify L0 -> L1 compaction does fail.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Verify all L0 files are still there.
+ ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+ // All key-values must exist after compaction fails.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+
+ env_->non_writable_count_ = 0;
+
+ // Make sure RocksDB will not get into corrupted state.
+ Reopen(options);
+
+ // Verify again after reopen.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
+ // iter 1 -- delete_obsolete_files_period_micros == 0
+ for (int iter = 0; iter < 2; ++iter) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ if (iter == 1) {
+ options.delete_obsolete_files_period_micros = 0;
+ }
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute L0->L1
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // block compactions
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::LOW);
+
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ Reopen(options);
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ // let compactions go
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ // this should execute L1->L2 (move)
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto moved_file_name = metadata[0].name;
+
+ // Create two more 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->L2 (merge with previous file)
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // iterator is holding the file
+ ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
+
+ listener->SetExpectedFileName(dbname_ + moved_file_name);
+ iterator.reset();
+
+ // this file should have been compacted away
+ ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
+ listener->VerifyMatchedCount(1);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
+ if (!Zlib_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
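+ // SpecialSkipListFactory makes the memtable signal a flush after this many
+ // entries, so each GenerateNewFile() call below is expected to produce one
+ // new file.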
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // The first two levels have no compression, so a trivial move between
+ // them is allowed. Level 2 uses Zlib compression, so a trivial move into
+ // the compressed levels is not allowed.
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kZlibCompression};
+ int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:Matches",
+ [&](void* /*arg*/) { matches++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:DidntMatch",
+ [&](void* /*arg*/) { didnt_match++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are going to level 0
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB triggers a compaction to 400K file to fill up level 0
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+ ASSERT_EQ(matches, 12);
+ // Currently, the test relies on the number of calls to
+ // InputCompressionMatchesOutput() per compaction.
+ const int kCallsToInputCompressionMatch = 2;
+ ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
+ ASSERT_EQ(trivial_move, 12);
+ ASSERT_EQ(non_trivial, 8);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) {
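+ // A soft pending-compaction bytes limit of 0, or one larger than the hard
+ // limit, is expected to be sanitized to the hard limit, as asserted below.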
+ Options options = CurrentOptions();
+ options.max_background_compactions = 5;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit);
+
+ options.max_background_compactions = 3;
+ options.soft_pending_compaction_bytes_limit = 200;
+ options.hard_pending_compaction_bytes_limit = 150;
+ DestroyAndReopen(options);
+ ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit);
+}
+
+ // This tests for a bug that could cause two level-0 compactions to run
+ // concurrently.
+// TODO(aekmekji): Make sure that the reason this fails when run with
+// max_subcompactions > 1 is not a correctness issue but just inherent to
+// running parallel L0-L1 compactions
+TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 2;
+
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ Random rnd(301);
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
+ {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
+ "CompactionJob::Run():End"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // trigger L0 compaction
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+ dbfull()->TEST_WaitForCompact();
+}
+
+static std::string ShortKey(int i) {
+ assert(i < 10000);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%04d", i);
+ return std::string(buf);
+}
+
+TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The key size is guaranteed to be <= 8
+ class ShortKeyComparator : public Comparator {
+ int Compare(const ROCKSDB_NAMESPACE::Slice& a,
+ const ROCKSDB_NAMESPACE::Slice& b) const override {
+ assert(a.size() <= 8);
+ assert(b.size() <= 8);
+ return BytewiseComparator()->Compare(a, b);
+ }
+ const char* Name() const override { return "ShortKeyComparator"; }
+ void FindShortestSeparator(
+ std::string* start,
+ const ROCKSDB_NAMESPACE::Slice& limit) const override {
+ return BytewiseComparator()->FindShortestSeparator(start, limit);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ return BytewiseComparator()->FindShortSuccessor(key);
+ }
+ } short_key_cmp;
+ Options options = CurrentOptions();
+ options.target_file_size_base = 100000000;
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ options.comparator = &short_key_cmp;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves),
+ // then compact the bottommost level L3=>L3 (non-trivial move).
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ // File with keys [ 200 => 299 ]
+ for (int i = 200; i < 300; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ trivial_move = 0;
+ non_trivial_move = 0;
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ // and will skip bottommost level compaction
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 3);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_EQ(Get(ShortKey(i)), values[i]);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(RandomString(&rnd, kValueSize));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB
+ // score: 1.5 1.3 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction.
+ //
+ // Files 6-9 are the longest span of available files for which
+ // work-per-deleted-file decreases (see "score" row above).
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(0), "")); // prevents trivial move
+ if (i == 5) {
+ ASSERT_OK(Put(Key(i + 1), value + value));
+ } else {
+ ASSERT_OK(Put(Key(i + 1), value));
+ }
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0)
+ ASSERT_EQ(2, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
+ // regression test for issue #2722: L0->L0 compaction can resurrect deleted
+ // keys from older L0 files if L1+ files' key-ranges do not include the key.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(RandomString(&rnd, kValueSize));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB
+ // score: 1.25 1.33 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction. All files
+ // 5-9 are included in the L0->L0 due to work-per-deleted file decreasing.
+ //
+ // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the
+ // L0->L0 preserves the deletion such that the key remains deleted.
+ for (int i = 0; i < 10; ++i) {
+ // key 0 serves both to prevent trivial move and as the key we want to
+ // verify is not resurrected by L0->L0 compaction.
+ if (i < 5) {
+ ASSERT_OK(Put(Key(0), ""));
+ } else {
+ ASSERT_OK(Delete(Key(0)));
+ }
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has a single output file from L0->L0
+ ASSERT_EQ(1, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22);
+
+ ReadOptions roptions;
+ std::string result;
+ ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound());
+}
+
+TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
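+ // Give the bottom-priority pool one thread so that bottommost/full
+ // compactions can run there and be counted via BGWorkBottomCompaction.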
+ for (bool use_universal_compaction : {false, true}) {
+ Options options = CurrentOptions();
+ if (use_universal_compaction) {
+ options.compaction_style = kCompactionStyleUniversal;
+ } else {
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ }
+ options.num_levels = 4;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ int num_bottom_pri_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkBottomCompaction",
+ [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(1, num_bottom_pri_compactions);
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(), 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) {
+ // Deletions can be dropped when compacted to non-last level if they fall
+ // outside the lower-level files' key-ranges.
+ const int kNumL0Files = 4;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // Put keys 1 and 3 in separate L1 and L2 files, so keys 0, 2, and 4+
+ // fall outside these levels' key-ranges.
+ for (int level = 2; level >= 1; --level) {
+ for (int i = 0; i < 2; ++i) {
+ Put(Key(2 * i + 1), "val");
+ Flush();
+ }
+ MoveFilesToLevel(level);
+ ASSERT_EQ(2, NumTableFilesAtLevel(level));
+ }
+
+ // Delete keys in range [1, 4]. These L0 files will be compacted with L1:
+ // - Tombstones for keys 2 and 4 can be dropped early.
+ // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges.
+ for (int i = 0; i < kNumL0Files; ++i) {
+ Put(Key(0), "val"); // sentinel to prevent trivial move
+ Delete(Key(i + 1));
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound());
+ }
+ ASSERT_EQ(2, options.statistics->getTickerCount(
+ COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE));
+ ASSERT_EQ(2,
+ options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE));
+}
+
+TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) {
+ // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/
+ // CompactFiles() had a bug where it failed to pick a compaction when an L0
+ // compaction existed, but marked it as scheduled anyway. It'd never be
+ // unmarked as scheduled, so future compactions or DB close could hang.
+ const int kNumL0Files = 5;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files - 1;
+ options.max_background_compactions = 2;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::CompactFilesPendingL0Bug:Picked"},
+ {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
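+ // Holding a compaction pressure token raises the number of compactions
+ // allowed to run concurrently, presumably so the CompactFiles() call below
+ // can be scheduled while the automatic L0 compaction is still in progress.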
+ auto schedule_multi_compaction_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // Files 0-3 will be included in an L0->L1 compaction.
+ //
+ // File 4 will be included in a call to CompactFiles() while the first
+ // compaction is running.
+ for (int i = 0; i < kNumL0Files - 1; ++i) {
+ ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
+ ASSERT_OK(Put(Key(i + 1), "val"));
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked");
+ // file 4 flushed after 0-3 picked
+ ASSERT_OK(Put(Key(kNumL0Files), "val"));
+ ASSERT_OK(Flush());
+
+ // Previously, DB close would hang forever, as this situation caused the
+ // scheduled compaction count to never decrement to zero.
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()
+ ->CompactFiles(CompactionOptions(), input_filenames,
+ 0 /* output_level */));
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) {
+ // Regression test for a bug where L0 files overlapping the user-specified
+ // input files in time- and key-ranges were not pulled in.
+ Put(Key(0), "old_val");
+ Flush();
+ Put(Key(0), "new_val");
+ Flush();
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_GE(cf_meta.levels.size(), 2);
+ ASSERT_EQ(2, cf_meta.levels[0].files.size());
+
+ // Compacting {new L0 file, L1 file} should pull in the old L0 file since it
+ // overlaps in key-range and time-range.
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+ 1 /* output_level */));
+ ASSERT_EQ("new_val", Get(Key(0)));
+}
+
+TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
+ // bottom-level files may contain deletions due to snapshots protecting the
+ // deleted keys. Once the snapshot is released, we should see files with many
+ // such deletions undergo single-file compactions.
+ const int kNumKeysPerFile = 1024;
+ const int kNumLevelFiles = 4;
+ const int kValueSize = 128;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumLevelFiles;
+ // inflate it a bit to account for key/metadata overhead
+ options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+ CreateAndReopenWithCF({"one"}, options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ if (i == kNumLevelFiles - 1) {
+ snapshot = db_->GetSnapshot();
+ // delete every other key after grabbing a snapshot, so these deletions
+ // and the keys they cover can't be dropped until after the snapshot is
+ // released.
+ for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+ ASSERT_OK(Delete(Key(j)));
+ }
+ }
+ Flush();
+ if (i < kNumLevelFiles - 1) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+ db_->GetLiveFilesMetaData(&pre_release_metadata);
+ // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+ // files does not need to be preserved in case of a future snapshot.
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ // release snapshot and wait for compactions to finish. Single-file
+ // compactions should be triggered, which reduce the size of each bottom-level
+ // file without changing file count.
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kBottommostFiles);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->TEST_WaitForCompact();
+ db_->GetLiveFilesMetaData(&post_release_metadata);
+ ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+
+ for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+ const auto& pre_file = pre_release_metadata[i];
+ const auto& post_file = post_release_metadata[i];
+ ASSERT_EQ(1, pre_file.level);
+ ASSERT_EQ(1, post_file.level);
+ // each file is smaller than it was before as it was rewritten without
+ // deletion markers/deleted keys.
+ ASSERT_LT(post_file.size, pre_file.size);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 1024;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.max_open_files = -1;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ env_->addon_time_.fetch_add(36 * 60 * 60); // 36 hours
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Just do a simple write + flush so that the TTL-expired files get
+ // compacted.
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->TEST_WaitForCompact();
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Test dynamically changing ttl.
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Move time forward by 12 hours, and make sure that compaction still doesn't
+ // trigger as ttl is set to 24 hours.
+ env_->addon_time_.fetch_add(12 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,2,0,2", FilesPerLevel());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Dynamically change ttl to 10 hours.
+ // This should trigger a ttl compaction, as 12 hours have already passed.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
+ dbfull()->TEST_WaitForCompact();
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
+ const int kValueSize = 100;
+
+ for (bool if_restart : {false, true}) {
+ for (bool if_open_all_files : {false, true}) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ if (if_open_all_files) {
+ options.max_open_files = -1;
+ } else {
+ options.max_open_files = 20;
+ }
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 2;
+ });
+ // In the case where all files are opened and a DB restart is done,
+ // force the oldest ancestor time in the manifest file to 0 to
+ // simulate reading from an old version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
+ if (if_restart && if_open_all_files) {
+ std::string* encoded_fieled = static_cast<std::string*>(arg);
+ *encoded_fieled = "";
+ PutVarint64(encoded_fieled, 0);
+ }
+ });
+
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ int ttl_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
+ Random rnd(301);
+ for (int i = 1; i <= 100; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ // Get the first file's creation time. This will be the oldest file in the
+ // DB. Compactions involving this file's descendants should keep getting
+ // this time.
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
+ // Add 1 hour and do another flush.
+ env_->addon_time_.fetch_add(1 * 60 * 60);
+ for (int i = 101; i <= 200; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ MoveFilesToLevel(6);
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+ env_->addon_time_.fetch_add(1 * 60 * 60);
+ // Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
+ for (int i = 1; i <= 50; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ env_->addon_time_.fetch_add(1 * 60 * 60);
+ for (int i = 51; i <= 150; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ MoveFilesToLevel(4);
+ ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
+
+ env_->addon_time_.fetch_add(1 * 60 * 60);
+ // Add one L1 file with key range: [26, 75].
+ for (int i = 26; i <= 75; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
+
+ // LSM tree:
+ // L1: [26 .. 75]
+ // L4: [1 .. 50][51 ..... 150]
+ // L6: [1 ........ 100][101 .... 200]
+ //
+ // On TTL expiry, a TTL compaction should be initiated on the L1 file, and
+ // compactions should keep going until the key range hits the bottom level.
+ // In other words, the compaction on this data range "cascades" until
+ // reaching the bottom level.
+ //
+ // Order of events on TTL expiry:
+ // 1. The L1 file falls to L3 via 2 trivial moves which are initiated by
+ //    the ttl compaction.
+ // 2. A TTL compaction happens between L3 and L4 files. Output file in L4.
+ // 3. The new output file from L4 falls to L5 via 1 trivial move initiated
+ //    by the ttl compaction.
+ // 4. A TTL compaction happens between L5 and L6 files. Output in L6.
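+ // In total that is 2 + 1 + 1 + 1 = 5 compaction picks with reason kTtl,
+ // which is what the ttl_compactions assertion below expects.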
+
+ // Add 25 hours and do a write
+ env_->addon_time_.fetch_add(25 * 60 * 60);
+
+ ASSERT_OK(Put(Key(1), "1"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(5, ttl_compactions);
+
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
+
+ env_->addon_time_.fetch_add(25 * 60 * 60);
+ ASSERT_OK(Put(Key(2), "1"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GE(ttl_compactions, 6);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ for (bool if_restart : {false, true}) {
+ for (bool if_open_all_files : {false, true}) {
+ Options options = CurrentOptions();
+ options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ if (if_open_all_files) {
+ options.max_open_files = -1; // needed for ttl compaction
+ } else {
+ options.max_open_files = 20;
+ }
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 0;
+ });
+ // In the case where all files are opened and a DB restart is done,
+ // force the file creation time in the manifest file to 0 to
+ // simulate reading from an old version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) {
+ if (if_restart && if_open_all_files) {
+ std::string* encoded_fieled = static_cast<std::string*>(arg);
+ *encoded_fieled = "";
+ PutVarint64(encoded_fieled, 0);
+ }
+ });
+
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumKeysPerFile + j),
+ RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ // Add 50 hours and do a write
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Assert that the files stay in the same level
+ ASSERT_EQ("3", FilesPerLevel());
+ // The two old files go through the periodic compaction process
+ ASSERT_EQ(2, periodic_compactions);
+
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,3", FilesPerLevel());
+
+ // Add another 50 hours and do another write
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("b", "2"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,3", FilesPerLevel());
+ // The three old files now go through the periodic compaction process,
+ // bringing the total to 2 + 3 = 5.
+ ASSERT_EQ(5, periodic_compactions);
+
+ // Add another 50 hours and do another write
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("c", "3"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("2,3", FilesPerLevel());
+ // The four old files now go through the periodic compaction process,
+ // bringing the total to 5 + 4 = 9.
+ ASSERT_EQ(9, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
+ // This test makes sure that periodic compactions work with a DB where the
+ // file_creation_time of some files is 0. After compactions, the new files
+ // are created with a valid file_creation_time.
+
+ const int kNumKeysPerFile = 32;
+ const int kNumFiles = 4;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ bool set_file_creation_time_to_zero = true;
+ bool set_creation_time_to_zero = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ if (set_file_creation_time_to_zero) {
+ props->file_creation_time = 0;
+ }
+ if (set_creation_time_to_zero) {
+ props->creation_time = 0;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ // Move the first two files to L2.
+ if (i == 1) {
+ MoveFilesToLevel(2);
+ set_creation_time_to_zero = false;
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("2,0,2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ Close();
+
+ set_file_creation_time_to_zero = false;
+ // Forward the clock by 2 days.
+ env_->addon_time_.fetch_add(2 * 24 * 60 * 60);
+ options.periodic_compaction_seconds = 1 * 24 * 60 * 60; // 1 day
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,0,2", FilesPerLevel());
+ // Make sure that all files go through periodic compaction.
+ ASSERT_EQ(kNumFiles, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ options.ttl = 10 * 60 * 60; // 10 hours
+ options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ options.max_open_files = -1; // needed for both periodic and ttl compactions
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ int ttl_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ } else if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ MoveFilesToLevel(3);
+
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+
+ // Add some time greater than periodic_compaction_time.
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Files in the bottom level go through periodic compactions.
+ ASSERT_EQ("1,0,0,2", FilesPerLevel());
+ ASSERT_EQ(2, periodic_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+
+ // Add a little more time than ttl
+ env_->addon_time_.fetch_add(11 * 60 * 60);
+ ASSERT_OK(Put("b", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Notice that the previous file in level 1 falls down to the bottom level
+ // due to ttl compactions, one level at a time.
+ // And bottom level files don't get picked up for ttl compactions.
+ ASSERT_EQ("1,0,0,3", FilesPerLevel());
+ ASSERT_EQ(2, periodic_compactions);
+ ASSERT_EQ(3, ttl_compactions);
+
+ // Add some time greater than periodic_compaction_time.
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+ ASSERT_OK(Put("c", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Previous L0 file falls one level at a time to bottom level due to ttl.
+ // And all 4 bottom files go through periodic compactions.
+ ASSERT_EQ("1,0,0,4", FilesPerLevel());
+ ASSERT_EQ(6, periodic_compactions);
+ ASSERT_EQ(6, ttl_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) {
+ class TestCompactionFilter : public CompactionFilter {
+ const char* Name() const override { return "TestCompactionFilter"; }
+ };
+ class TestCompactionFilterFactory : public CompactionFilterFactory {
+ const char* Name() const override { return "TestCompactionFilterFactory"; }
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
+ }
+ };
+
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ TestCompactionFilter test_compaction_filter;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+ env_->addon_time_.store(0);
+
+ enum CompactionFilterType {
+ kUseCompactionFilter,
+ kUseCompactionFilterFactory
+ };
+
+ for (CompactionFilterType comp_filter_type :
+ {kUseCompactionFilter, kUseCompactionFilterFactory}) {
+ // Assert that periodic compactions are not enabled.
+ ASSERT_EQ(port::kMaxUint64 - 1, options.periodic_compaction_seconds);
+
+ if (comp_filter_type == kUseCompactionFilter) {
+ options.compaction_filter = &test_compaction_filter;
+ options.compaction_filter_factory.reset();
+ } else if (comp_filter_type == kUseCompactionFilterFactory) {
+ options.compaction_filter = nullptr;
+ options.compaction_filter_factory.reset(
+ new TestCompactionFilterFactory());
+ }
+ DestroyAndReopen(options);
+
+ // periodic_compaction_seconds should be set to the sanitized value when
+ // a compaction filter or a compaction filter factory is used.
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ int periodic_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ // Add 31 days and do a write
+ env_->addon_time_.fetch_add(31 * 24 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Assert that the files stay in the same level
+ ASSERT_EQ("3", FilesPerLevel());
+ // The two old files go through the periodic compaction process
+ ASSERT_EQ(2, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers flush after it's sure stall won't be triggered for
+ // L0 file count going too high.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as
+ // compaction trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ if (i == 0) {
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ } else {
+ options.level0_file_num_compaction_trigger = kNumL0FilesLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the auto compaction doesn't finish until manual compaction has
+ // had a chance to be delayed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "CompactionJob::Run():End"}});
+ } else {
+ // ensure the auto-compaction doesn't finish until manual compaction has
+ // continued without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "CompactionJob::Run():End"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(Key(k), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
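+    // With kNumL0FilesLimit - 1 L0 files already present, the flush issued by
+    // CompactRange could push L0 to the slowdown trigger, so with
+    // allow_write_stall == false it may have to wait for the pending
+    // auto-compaction first.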
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers flush after it's sure stall won't be triggered for
+ // immutable memtable count going too high.
+ const int kNumImmMemTableLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as flush
+ // trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+    // The delay limit is one less than the stop limit. This test focuses on
+    // avoiding the delay limit, but this option sets the stop limit, so add
+    // one.
+ options.max_write_buffer_number = kNumImmMemTableLimit + 1;
+ if (i == 1) {
+ options.min_write_buffer_number_to_merge = kNumImmMemTableLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the flush doesn't finish until manual compaction has had a
+ // chance to be delayed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "FlushJob::WriteLevel0Table"}});
+ } else {
+ // ensure the flush doesn't finish until manual compaction has continued
+ // without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "FlushJob::WriteLevel0Table"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) {
+ ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024)));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ flush_opts.allow_write_stall = true;
+ dbfull()->Flush(flush_opts);
+ }
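+    // At this point kNumImmMemTableLimit - 1 immutable memtables are pending
+    // flush, one short of the delay trigger.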
+
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay
+ // does not hang if CF is dropped or DB is closed
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it
+ // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to
+ // simulate what happens during Close as we can't call Close (it
+ // blocks on the auto-compaction, making a cycle).
+ for (int i = 0; i < 2; ++i) {
+ CreateAndReopenWithCF({"one"}, options);
+ // The calls to close CF/DB wait until the manual compaction stalls.
+ // The auto-compaction waits until the manual compaction finishes to ensure
+ // the signal comes from closing CF/DB, not from compaction making progress.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"},
+ {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual",
+ "CompactionJob::Run():End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(1, Key(k), RandomString(&rnd, 1024)));
+ }
+ Flush(1);
+ }
+ auto manual_compaction_thread = port::Thread([this, i]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+      if (i == 0) {
+        ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+                        .IsColumnFamilyDropped());
+      } else {
+        ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+                        .IsShutdownInProgress());
+      }
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown");
+ if (i == 0) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ } else {
+ dbfull()->CancelAllBackgroundWork(false /* wait */);
+ }
+ manual_compaction_thread.join();
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual");
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`,
+ // CompactRange skips its flush if the delay is long enough that the memtables
+ // existing at the beginning of the call have already been flushed.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ Reopen(options);
+
+ Random rnd(301);
+ // The manual flush includes the memtable that was active when CompactRange
+ // began. So it unblocks CompactRange and precludes its flush. Throughout the
+ // test, stall conditions are upheld via high L0 file count.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"},
+ {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush",
+ "DBImpl::FlushMemTable:StallWaitDone"},
+ {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Used for the delayable flushes.
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ for (int i = 0; i < kNumL0FilesLimit - 1; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ dbfull()->Flush(flush_opts);
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush");
+ Put(ToString(0), RandomString(&rnd, 1024));
+ dbfull()->Flush(flush_opts);
+ Put(ToString(0), RandomString(&rnd, 1024));
+ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush");
+ manual_compaction_thread.join();
+
+ // If CompactRange's flush was skipped, the final Put above will still be
+ // in the active memtable.
+ std::string num_keys_in_memtable;
+  db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable,
+                   &num_keys_in_memtable);
+ ASSERT_EQ(ToString(1), num_keys_in_memtable);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) {
+ // Verify memtable only gets flushed if it contains data overlapping the range
+ // provided to `CompactRange`. Tests all kinds of overlap/non-overlap.
+ const int kNumEndpointKeys = 5;
+ std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"};
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // One extra iteration for nullptr, which means left side of interval is
+ // unbounded.
+ for (int i = 0; i <= kNumEndpointKeys; ++i) {
+ Slice begin;
+ Slice* begin_ptr;
+ if (i == 0) {
+ begin_ptr = nullptr;
+ } else {
+ begin = keys[i - 1];
+ begin_ptr = &begin;
+ }
+ // Start at `i` so right endpoint comes after left endpoint. One extra
+ // iteration for nullptr, which means right side of interval is unbounded.
+ for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) {
+ Slice end;
+ Slice* end_ptr;
+ if (j == kNumEndpointKeys) {
+ end_ptr = nullptr;
+ } else {
+ end = keys[j];
+ end_ptr = &end;
+ }
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Put("d", "val"));
+ CompactRangeOptions compact_range_opts;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr));
+
+ uint64_t get_prop_tmp, num_memtable_entries = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ if (begin_ptr == nullptr || end_ptr == nullptr ||
+ (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) {
+ // In this case `CompactRange`'s range overlapped in some way with the
+ // memtable's range, so flush should've happened. Then "b" and "d" won't
+ // be in the memtable.
+ ASSERT_EQ(0, num_memtable_entries);
+ } else {
+ ASSERT_EQ(2, num_memtable_entries);
+        // Flush anyway to prepare for the next iteration.
+ db_->Flush(FlushOptions());
+ }
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, CompactionStatsTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ CompactionStatsCollector* collector = new CompactionStatsCollector();
+ options.listeners.emplace_back(collector);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ Put(std::to_string(j), std::string(1, 'A'));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ VerifyCompactionStats(*cfd, *collector);
+}
+
+TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
+ // LSM setup:
+ // L1: [ba bz]
+ // L2: [a b] [c d]
+ // L3: [a b] [c d]
+ //
+ // Thread 1: Thread 2:
+ // Begin compacting all L2->L3
+ // Compact [ba bz] L1->L3
+ // End compacting all L2->L3
+ //
+ // The compaction operation in thread 2 should be disallowed because the range
+ // overlaps with the compaction in thread 1, which also covers that range in
+ // L3.
+ Options options = CurrentOptions();
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ for (int level = 3; level >= 2; --level) {
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("c", "val"));
+ ASSERT_OK(Put("d", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(level);
+ }
+ ASSERT_OK(Put("ba", "val"));
+ ASSERT_OK(Put("bz", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
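+  // The LSM now matches the diagram above: [ba bz] in L1, and [a b] [c d] in
+  // both L2 and L3.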
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"},
+ {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End",
+ "CompactFilesImpl:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ auto bg_thread = port::Thread([&]() {
+ // Thread 1
+ std::vector<std::string> filenames = collector->GetFlushedFiles();
+ filenames.pop_back();
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames,
+ 3 /* output_level */));
+ });
+
+ // Thread 2
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin");
+ std::string filename = collector->GetFlushedFiles().back();
+ ASSERT_FALSE(
+ db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */)
+ .ok());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End");
+
+ bg_thread.join();
+}
+
+TEST_F(DBCompactionTest, CompactionHasEmptyOutput) {
+ Options options = CurrentOptions();
+ SstStatsCollector* collector = new SstStatsCollector();
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ // Make sure the L0 files overlap to prevent trivial move.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("a"));
+ ASSERT_OK(Delete("b"));
+ ASSERT_OK(Flush());
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Expect one file creation to start for each flush, and zero for compaction
+ // since no keys are written.
+ ASSERT_EQ(2, collector->num_ssts_creation_started());
+}
+
+TEST_F(DBCompactionTest, CompactionLimiter) {
+ const int kNumKeysPerFile = 10;
+ const int kMaxBackgroundThreads = 64;
+
+ struct CompactionLimiter {
+ std::string name;
+ int limit_tasks;
+ int max_tasks;
+ int tasks;
+ std::shared_ptr<ConcurrentTaskLimiter> limiter;
+ };
+
+ std::vector<CompactionLimiter> limiter_settings;
+ limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
+
+ for (auto& ls : limiter_settings) {
+ ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
+ }
+
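+  // A limit of -1 means unlimited; this limiter is used below to let one CF
+  // bypass compaction throttling.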
+ std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
+ NewConcurrentTaskLimiter("unique_limiter", -1));
+
+ const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5",
+ "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
+ const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0];
+
+ std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 * 1024; // 110KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 64;
+ options.level0_stop_writes_trigger = 64;
+ options.max_background_jobs = kMaxBackgroundThreads; // Enough threads
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ options.max_write_buffer_number = 10; // Enough memtables
+ DestroyAndReopen(options);
+
+ std::vector<Options> option_vector;
+ option_vector.reserve(cf_count);
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ColumnFamilyOptions cf_opt(options);
+ if (cf == 0) {
+ // "Default" CF does't use compaction limiter
+ cf_opt.compaction_thread_limiter = nullptr;
+ } else if (cf == 1) {
+ // "1" CF uses bypass compaction limiter
+ unique_limiter->SetMaxOutstandingTask(-1);
+ cf_opt.compaction_thread_limiter = unique_limiter;
+ } else {
+ // Assign limiter by mod
+ auto& ls = limiter_settings[cf % 3];
+ cf_opt.compaction_thread_limiter = ls.limiter;
+ cf_to_limiter[cf_names[cf]] = &ls;
+ }
+ option_vector.emplace_back(DBOptions(options), cf_opt);
+ }
+
+ for (unsigned int cf = 1; cf < cf_count; cf++) {
+ CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
+ }
+
+ ReopenWithColumnFamilies(std::vector<std::string>(cf_names,
+ cf_names + cf_count),
+ option_vector);
+
+ port::Mutex mutex;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
+ iter->second->max_tasks =
+ std::max(iter->second->max_tasks, iter->second->limit_tasks);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(--iter->second->tasks, 0);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Split the background threads between the flush (HIGH) and compaction
+  // (LOW) pools.
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4;
+ const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks;
+ env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH);
+ env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW);
+
+ test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks];
+
+ // Block all compaction threads in thread pool.
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_compact_tasks[i], Env::LOW);
+ sleeping_compact_tasks[i].WaitUntilSleeping();
+ }
+
+ int keyIndex = 0;
+
+ for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) {
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(keyIndex++), ""));
+ }
+      // Put an extra key to trigger flush.
+ ASSERT_OK(Put(cf, "", ""));
+ }
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ }
+ }
+
+ // Enough L0 files to trigger compaction
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf),
+ options.level0_file_num_compaction_trigger);
+ }
+
+  // Create more files for one column family, which triggers the speed-up
+  // condition; all compactions will then be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(0, Key(i), ""));
+ }
+    // Put an extra key to trigger flush.
+ ASSERT_OK(Put(0, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 0));
+ }
+
+ // All CFs are pending compaction
+ ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW));
+
+ // Unblock all compaction threads
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ sleeping_compact_tasks[i].WakeUp();
+ sleeping_compact_tasks[i].WaitUntilDone();
+ }
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Max outstanding compaction tasks reached the limit.
+ for (auto& ls : limiter_settings) {
+ ASSERT_EQ(ls.limit_tasks, ls.max_tasks);
+ ASSERT_EQ(0, ls.limiter->GetOutstandingTask());
+ }
+
+ // test manual compaction under a fully throttled limiter
+ int cf_test = 1;
+ unique_limiter->SetMaxOutstandingTask(0);
+
+ // flush one more file to cf 1
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf_test, Key(keyIndex++), ""));
+ }
+  // Put an extra key to trigger flush.
+ ASSERT_OK(Put(cf_test, "", ""));
+
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test));
+
+ Compact(cf_test, Key(0), Key(keyIndex));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
+ ::testing::Values(std::make_tuple(1, true),
+ std::make_tuple(1, false),
+ std::make_tuple(4, true),
+ std::make_tuple(4, false)));
+
+TEST_P(DBCompactionDirectIOTest, DirectIO) {
+ Options options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = new MockEnv(Env::Default());
+ Reopen(options);
+ bool readahead = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+ if (options.use_direct_io_for_flush_and_compaction) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions:direct_io", [&](void* /*arg*/) {
+ readahead = true;
+ });
+ }
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+ Compact(1, "p1", "p9");
+ ASSERT_EQ(readahead, options.use_direct_reads);
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+ Destroy(options);
+ delete options.env;
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
+ testing::Bool());
+
+class CompactionPriTest : public DBTestBase,
+ public testing::WithParamInterface<uint32_t> {
+ public:
+ CompactionPriTest() : DBTestBase("/compaction_pri_test") {
+ compaction_pri_ = GetParam();
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t compaction_pri_;
+};
+
+TEST_P(CompactionPriTest, Test) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 16 * 1024;
+ options.compaction_pri = static_cast<CompactionPri>(compaction_pri_);
+ options.hard_pending_compaction_bytes_limit = 256 * 1024;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ const int kNKeys = 5000;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 102)));
+ }
+
+ dbfull()->TEST_WaitForCompact();
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ CompactionPriTest, CompactionPriTest,
+ ::testing::Values(CompactionPri::kByCompensatedSize,
+ CompactionPri::kOldestLargestSeqFirst,
+ CompactionPri::kOldestSmallestSeqFirst,
+ CompactionPri::kMinOverlappingRatio));
+
+class NoopMergeOperator : public MergeOperator {
+ public:
+ NoopMergeOperator() {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* merge_out) const override {
+ std::string val("bar");
+ merge_out->new_value = val;
+ return true;
+ }
+
+ const char* Name() const override { return "Noop"; }
+};
+
+TEST_F(DBCompactionTest, PartialManualCompaction) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 10240;
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ Merge("foo", RandomString(&rnd, 1024));
+ }
+ Flush();
+ }
+
+ MoveFilesToLevel(2);
+
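+  // Cap max_compaction_bytes at half of the live SST size so the bottommost
+  // manual compaction cannot cover all files in a single job and must run as
+  // multiple partial compactions.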
+ std::string prop;
+ EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
+ uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+}
+
+TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
+ // Regression test for bug where manual compaction hangs forever when the DB
+ // is in read-only mode. Verify it now at least returns, despite failing.
+ const int kNumL0Files = 4;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.env = mock_env.get();
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure files are overlapping in key-range to prevent trivial move.
+ Put("key1", RandomString(&rnd, 1024));
+ Put("key2", RandomString(&rnd, 1024));
+ Flush();
+ }
+ ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
+
+ // Enter read-only mode by failing a write.
+ mock_env->SetFilesystemActive(false);
+ // Make sure this is outside `CompactRange`'s range so that it doesn't fail
+ // early trying to flush memtable.
+ ASSERT_NOK(Put("key3", RandomString(&rnd, 1024)));
+
+ // In the bug scenario, the first manual compaction would fail and forget to
+ // unregister itself, causing the second one to hang forever due to conflict
+ // with a non-running compaction.
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ Slice begin_key("key1");
+ Slice end_key("key2");
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+
+ // Close before mock_env destruct.
+ Close();
+}
+
+// ManualCompactionBottomLevelOptimized tests the bottom-level manual
+// compaction optimization that skips recompacting files created by an
+// Ln-1 to Ln compaction.
+TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 5;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 1024;
+ opts.max_bytes_for_level_multiplier = 2;
+ opts.disable_auto_compactions = true;
+ DestroyAndReopen(opts);
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ InternalStats* internal_stats_ptr = cfd->internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
+
+ MoveFilesToLevel(2);
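+  // The "foo" files now form the bottommost level (L2); the "bar" flushes
+  // below stay in L0 since auto compactions are disabled.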
+
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("bar" + std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ int num = comp_stats[2].num_input_files_in_output_level;
+ ASSERT_EQ(num, 0);
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ const std::vector<InternalStats::CompactionStats>& comp_stats2 =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ num = comp_stats2[2].num_input_files_in_output_level;
+ ASSERT_EQ(num, 0);
+}
+
+TEST_F(DBCompactionTest, CompactionDuringShutdown) {
+ Options opts = CurrentOptions();
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.disable_auto_compactions = true;
+ DestroyAndReopen(opts);
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ InternalStats* internal_stats_ptr = cfd->internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+
+ Random rnd(301);
+ for (auto i = 0; i < 2; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+ [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_OK(dbfull()->error_handler_.GetBGError());
+}
+
+// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
+// file ingestion do not cause deadlock in the event of write stall triggered
+// by number of L0 files reaching level0_stop_writes_trigger.
+TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
+ const int kNumKeysPerFile = 100;
+ // Generate SST files.
+ Options options = CurrentOptions();
+
+ // Generate an external SST file containing a single key, i.e. 99
+ std::string sst_files_dir = dbname_ + "/sst_files/";
+ test::DestroyDir(env_, sst_files_dir);
+ ASSERT_OK(env_->CreateDir(sst_files_dir));
+ SstFileWriter sst_writer(EnvOptions(), options);
+ const std::string sst_file_path = sst_files_dir + "test.sst";
+ ASSERT_OK(sst_writer.Open(sst_file_path));
+ ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value"));
+ ASSERT_OK(sst_writer.Finish());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+ "BackgroundCallCompaction:0"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 110 << 10; // 110KB
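+  // Make the auto-compaction trigger coincide with the write-stop trigger so
+  // that filling L0 both stalls writes and schedules a compaction.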
+ options.level0_file_num_compaction_trigger =
+ options.level0_stop_writes_trigger;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Generate level0_stop_writes_trigger L0 files to trigger write stop
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ for (int j = 0; j != kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 990)));
+ }
+ if (0 == i) {
+      // When we reach here, the memtable has kNumKeysPerFile keys. Note that
+      // flush is not yet triggered. We need to write an extra key so that the
+      // write path calls PreprocessWrite and the previous key-value pairs get
+      // flushed. After that, the newest key is in the memtable, along with a
+      // bunch of L0 files. Since there is already one key in the memtable,
+      // for i = 1, 2, ... we do not have to write this extra key to trigger
+      // flush.
+ ASSERT_OK(Put("", ""));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1);
+ }
+ // When we reach this point, there will be level0_stop_writes_trigger L0
+ // files and one extra key (99) in memory, which overlaps with the external
+ // SST file. Write stall triggers, and can be cleared only after compaction
+ // reduces the number of L0 files.
+
+ // Compaction will also be triggered since we have reached the threshold for
+ // auto compaction. Note that compaction may begin after the following file
+ // ingestion thread and waits for ingestion to finish.
+
+ // Thread to ingest file with overlapping key range with the current
+ // memtable. Consequently ingestion will trigger a flush. The flush MUST
+ // proceed without waiting for the write stall condition to clear, otherwise
+ // deadlock can happen.
+ port::Thread ingestion_thr([&]() {
+ IngestExternalFileOptions ifo;
+ Status s = db_->IngestExternalFile({sst_file_path}, ifo);
+ ASSERT_OK(s);
+ });
+
+  // Wait for ingestion to finish, then for compaction to clear the write
+  // stall.
+ ingestion_thr.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Close();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistency", [&](void* arg) {
+ auto p =
+ reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+          // Just swap the two FileMetaData entries so that we hit an error
+          // in the CheckConsistency function.
+ FileMetaData* temp = *(p->first);
+ *(p->first) = *(p->second);
+ *(p->second) = temp;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put("foo", "bar"));
+ Flush();
+ }
+
+ ASSERT_NOK(Put("foo", "bar"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+void IngestOneKeyValue(DBImpl* db, const std::string& key,
+ const std::string& value, const Options& options) {
+ ExternalSstFileInfo info;
+ std::string f = test::PerThreadDBPath("sst_file" + key);
+ EnvOptions env;
+ ROCKSDB_NAMESPACE::SstFileWriter writer(env, options);
+ auto s = writer.Open(f);
+ ASSERT_OK(s);
+ ASSERT_OK(writer.Put(key, value));
+
+ ASSERT_OK(writer.Finish(&info));
+ IngestExternalFileOptions ingest_opt;
+
+ ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
+}
+
+TEST_P(DBCompactionTestWithParam,
+ FlushAfterIntraL0CompactionCheckConsistencyFail) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::atomic<int> pick_intra_l0_count(0);
+ std::string value(RandomString(&rnd, kValueSize));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompactionTestWithParam::FlushAfterIntraL0:1",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FindIntraL0Compaction",
+ [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // prevents trivial move
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(i), "")); // prevents trivial move
+ }
+ ASSERT_OK(Flush());
+ Compact("", Key(99));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Flush 5 L0 SSTs.
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Put one key so that the smallest log sequence number in this memtable is
+  // less than that of the SSTs ingested in the next step.
+ ASSERT_OK(Put(Key(0), "a"));
+
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Ingest 5 L0 SSTs. These files will trigger PickIntraL0Compaction.
+ for (int i = 5; i < 10; i++) {
+ IngestOneKeyValue(dbfull(), Key(i), value, options);
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+
+ TEST_SYNC_POINT("DBCompactionTestWithParam::FlushAfterIntraL0:1");
+  // Put one key so that the biggest log sequence number in this memtable is
+  // bigger than that of the ingested SSTs.
+ ASSERT_OK(Put(Key(2), "b"));
+ ASSERT_EQ(10, NumTableFilesAtLevel(0));
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GT(level_to_files[0].size(), 0);
+ ASSERT_GT(pick_intra_l0_count.load(), 0);
+
+ ASSERT_OK(Flush());
+}
+
+TEST_P(DBCompactionTestWithParam,
+ IntraL0CompactionAfterFlushCheckConsistencyFail) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ options.write_buffer_size = 2 << 20;
+ options.max_write_buffer_number = 6;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(RandomString(&rnd, kValueSize));
+ std::string value2(RandomString(&rnd, kValueSize));
+ std::string bigvalue = value + value;
+
+ // prevents trivial move
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(i), "")); // prevents trivial move
+ }
+ ASSERT_OK(Flush());
+ Compact("", Key(99));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ std::atomic<int> pick_intra_l0_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FindIntraL0Compaction",
+ [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Make 6 L0 SSTs.
+ for (int i = 0; i < 6; ++i) {
+ if (i % 2 == 0) {
+ IngestOneKeyValue(dbfull(), Key(i), value, options);
+ } else {
+ ASSERT_OK(Put(Key(i), value));
+ ASSERT_OK(Flush());
+ }
+ }
+
+ ASSERT_EQ(6, NumTableFilesAtLevel(0));
+
+  // Block the flush thread pool so the flush job cannot run.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ test::SleepingBackgroundTask sleeping_tasks;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks,
+ Env::Priority::HIGH);
+ sleeping_tasks.WaitUntilSleeping();
+
+  // Put large values so the memtables fill up and request flushes.
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_OK(Put(Key(i), bigvalue));
+ }
+
+ ASSERT_EQ(6, NumTableFilesAtLevel(0));
+  // Ingest files to trigger IntraL0Compaction.
+ for (int i = 6; i < 10; ++i) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0));
+ IngestOneKeyValue(dbfull(), Key(i), value2, options);
+ }
+ ASSERT_EQ(10, NumTableFilesAtLevel(0));
+
+ // Wake up flush job
+ sleeping_tasks.WakeUp();
+ sleeping_tasks.WaitUntilDone();
+ TEST_SYNC_POINT("DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1");
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ uint64_t error_count = 0;
+ db_->GetIntProperty("rocksdb.background-errors", &error_count);
+ ASSERT_EQ(error_count, 0);
+ ASSERT_GT(pick_intra_l0_count.load(), 0);
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_EQ(bigvalue, Get(Key(i)));
+ }
+ for (int i = 6; i < 10; ++i) {
+ ASSERT_EQ(value2, Get(Key(i)));
+ }
+}
+
+#endif // !defined(ROCKSDB_LITE)
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_dynamic_level_test.cc b/src/rocksdb/db/db_dynamic_level_test.cc
new file mode 100644
index 000000000..c26657701
--- /dev/null
+++ b/src/rocksdb/db/db_dynamic_level_test.cc
@@ -0,0 +1,505 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release build, which is a pity, as it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBTestDynamicLevel : public DBTestBase {
+ public:
+ DBTestDynamicLevel() : DBTestBase("/db_dynamic_level_test") {}
+};
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) {
+ if (!Snappy_Supported() || !LZ4_Supported()) {
+ return;
+ }
+ // Use InMemoryEnv, or it would be too slow.
+ std::unique_ptr<Env> env(new MockEnv(env_));
+
+ const int kNKeys = 1000;
+ int keys[kNKeys];
+
+ auto verify_func = [&]() {
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
+ if (i < kNKeys / 10) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+ } else {
+ ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+ }
+ }
+ };
+
+ Random rnd(301);
+ for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ if (ordered_insert == 0) {
+ std::random_shuffle(std::begin(keys), std::end(keys));
+ }
+ for (int max_background_compactions = 1; max_background_compactions < 4;
+ max_background_compactions += 2) {
+ Options options;
+ options.env = env.get();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_rate_limit = 1.1;
+ options.max_background_compactions = max_background_compactions;
+ options.num_levels = 5;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kLZ4Compression;
+ options.compression_per_level[2] = kSnappyCompression;
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNKeys; i++) {
+ int key = keys[i];
+ ASSERT_OK(Put(Key(kNKeys + key), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(key), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(kNKeys * 2 + key), RandomString(&rnd, 102)));
+ ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
+ env_->SleepForMicroseconds(5000);
+ }
+
+ uint64_t int_prop;
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
+ ASSERT_EQ(0U, int_prop);
+
+ // Verify DB
+ for (int j = 0; j < 2; j++) {
+ verify_func();
+ if (j == 0) {
+ Reopen(options);
+ }
+ }
+
+ // Test compact range works
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ // All data should be in the last level.
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ ASSERT_EQ(5U, cf_meta.levels.size());
+ for (int i = 0; i < 4; i++) {
+ ASSERT_EQ(0U, cf_meta.levels[i].files.size());
+ }
+ ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
+ verify_func();
+
+ Close();
+ }
+ }
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) {
+ Random rnd(301);
+ int kMaxKey = 1000000;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.write_buffer_size = 20480;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 9999;
+ options.level0_stop_writes_trigger = 9999;
+ options.target_file_size_base = 9102;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 40960;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 2;
+ options.num_levels = 5;
+ options.max_compaction_bytes = 0; // Force not expanding in compactions
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+
+ uint64_t int_prop;
+ std::string str_prop;
+
+ // Initial base level is the last level
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Put about 28K to L0
+ for (int i = 0; i < 70; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+  // Insert about another 28K to L0. After it is compacted to L4, the base
+  // level should change to L3.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ for (int i = 0; i < 70; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+ ASSERT_EQ("0", str_prop);
+
+ // Write even more data while leaving the base level at L3.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ // Write about 40K more
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+
+ // Fill up L0, and then run an (auto) L0->Lmax compaction to raise the base
+ // level to 2.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ // Write about 650K more.
+ // Each file is about 11KB, with 9KB of data.
+ for (int i = 0; i < 1300; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+
+ // Make sure that the compaction starts before the last bit of data is
+ // flushed, so that the base level isn't raised to L1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0");
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(2U, int_prop);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Write more data until the base level changes to L1. There will be
+ // a manual compaction going on at the same time.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:1"},
+ {"DynamicLevelMaxBytesBase2:2", "CompactionJob::Run():End"},
+ {"DynamicLevelMaxBytesBase2:compact_range_finish",
+ "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([this] {
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_start");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish");
+ });
+
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1");
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 380)));
+ }
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2");
+
+ Flush();
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(1U, int_prop);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) {
+ Random rnd(301);
+ int kMaxKey = 1000000;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 9999;
+ options.level0_stop_writes_trigger = 9999;
+ options.target_file_size_base = 2;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 1;
+ const int kNumLevels = 5;
+ options.num_levels = kNumLevels;
+ options.max_compaction_bytes = 1; // Force not expanding in compactions
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Compact against empty DB
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ uint64_t int_prop;
+ std::string str_prop;
+
+ // Initial base level is the last level
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Put about 7K to L0
+ for (int i = 0; i < 140; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 80)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ if (NumTableFilesAtLevel(0) == 0) {
+ // Make sure level 0 is not empty
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ RandomString(&rnd, 80)));
+ Flush();
+ }
+
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+ ASSERT_EQ("0", str_prop);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::set<int> output_levels;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::CompactRange:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ output_levels.insert(compaction->output_level());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(output_levels.size(), 2);
+ ASSERT_TRUE(output_levels.find(3) != output_levels.end());
+ ASSERT_TRUE(output_levels.find(4) != output_levels.end());
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ // Base level is still level 3.
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+}
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_rate_limit = 1.1;
+ options.max_background_compactions = 2;
+ options.num_levels = 5;
+ options.max_compaction_bytes = 100000000;
+
+ DestroyAndReopen(options);
+
+ int non_trivial = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ const int total_keys = 3000;
+ const int random_part_size = 100;
+ for (int i = 0; i < total_keys; i++) {
+ std::string value = RandomString(&rnd, random_part_size);
+ PutFixed32(&value, static_cast<uint32_t>(i));
+ ASSERT_OK(Put(Key(i), value));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_EQ(non_trivial, 0);
+
+ for (int i = 0; i < total_keys; i++) {
+ std::string value = Get(Key(i));
+ ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
+ static_cast<uint32_t>(i));
+ }
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) {
+ Random rnd(301);
+ const int kMaxKey = 2000;
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 8;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = false;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_rate_limit = 1.1;
+ options.num_levels = 8;
+
+ DestroyAndReopen(options);
+
+ auto verify_func = [&](int num_keys, bool if_sleep) {
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(kMaxKey + i)));
+ if (i < num_keys / 10) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(i)));
+ } else {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+ if (if_sleep && i % 1000 == 0) {
+ // Without it, valgrind may choose not to give another
+ // thread a chance to run before finishing the function,
+ // causing the test to be extremely slow.
+ env_->SleepForMicroseconds(1);
+ }
+ }
+ };
+
+ int total_keys = 1000;
+ for (int i = 0; i < total_keys; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+ ASSERT_OK(Delete(Key(i / 10)));
+ }
+ verify_func(total_keys, false);
+ dbfull()->TEST_WaitForCompact();
+
+ options.level_compaction_dynamic_level_bytes = true;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ verify_func(total_keys, false);
+
+ std::atomic_bool compaction_finished;
+ compaction_finished = false;
+ // Issue manual compaction in one thread and still verify DB state
+ // in main thread.
+ ROCKSDB_NAMESPACE::port::Thread t([&]() {
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = options.num_levels - 1;
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ compaction_finished.store(true);
+ });
+ do {
+ verify_func(total_keys, true);
+ } while (!compaction_finished.load());
+ t.join();
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ int total_keys2 = 2000;
+ for (int i = total_keys; i < total_keys2; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+ ASSERT_OK(Delete(Key(i / 10)));
+ }
+
+ verify_func(total_keys2, false);
+ dbfull()->TEST_WaitForCompact();
+ verify_func(total_keys2, false);
+
+ // Base level is not level 1
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_encryption_test.cc b/src/rocksdb/db/db_encryption_test.cc
new file mode 100644
index 000000000..b1f3ce23f
--- /dev/null
+++ b/src/rocksdb/db/db_encryption_test.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include <iostream>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBEncryptionTest : public DBTestBase {
+ public:
+ DBEncryptionTest() : DBTestBase("/db_encryption_test") {}
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBEncryptionTest, CheckEncrypted) {
+ ASSERT_OK(Put("foo567", "v1.fetdq"));
+ ASSERT_OK(Put("bar123", "v2.dfgkjdfghsd"));
+ Close();
+
+ // Open all files and look for the values we've put in there.
+  // They should not be found if the env is encrypted; otherwise
+ // they should be found.
+ std::vector<std::string> fileNames;
+ auto status = env_->GetChildren(dbname_, &fileNames);
+ ASSERT_OK(status);
+
+ auto defaultEnv = Env::Default();
+ int hits = 0;
+ for (auto it = fileNames.begin() ; it != fileNames.end(); ++it) {
+ if ((*it == "..") || (*it == ".")) {
+ continue;
+ }
+ auto filePath = dbname_ + "/" + *it;
+ std::unique_ptr<SequentialFile> seqFile;
+ auto envOptions = EnvOptions(CurrentOptions());
+ status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions);
+ ASSERT_OK(status);
+
+ uint64_t fileSize;
+ status = defaultEnv->GetFileSize(filePath, &fileSize);
+ ASSERT_OK(status);
+
+ std::string scratch;
+ scratch.reserve(fileSize);
+ Slice data;
+ status = seqFile->Read(fileSize, &data, (char*)scratch.data());
+ ASSERT_OK(status);
+
+ if (data.ToString().find("foo567") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("v1.fetdq") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("bar123") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("v2.dfgkjdfghsd") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("dfgk") != std::string::npos) {
+ hits++;
+ //std::cout << "Hit in " << filePath << "\n";
+ }
+ }
+ if (encrypted_env_) {
+ ASSERT_EQ(hits, 0);
+ } else {
+ ASSERT_GE(hits, 4);
+ }
+}
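+
+// A minimal sketch (not exercised by this test) of how a standalone program
+// could construct the encrypted Env that the check above is sensitive to.
+// DBTestBase sets encrypted_env_ from the test environment; the helper names
+// below (ROT13BlockCipher, CTREncryptionProvider, NewEncryptedEnv) follow the
+// rocksdb/env_encryption.h API of this version and may differ elsewhere:
+//
+//   #include "rocksdb/env_encryption.h"
+//
+//   ROT13BlockCipher cipher(/*block_size=*/32);
+//   CTREncryptionProvider provider(cipher);
+//   std::unique_ptr<Env> encrypted_env(
+//       NewEncryptedEnv(Env::Default(), &provider));
+//   Options options;                    // hypothetical Options of a real app
+//   options.env = encrypted_env.get();
+//
+// With such an env, the raw-file scan above should report zero plaintext hits.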
+
+TEST_F(DBEncryptionTest, ReadEmptyFile) {
+ auto defaultEnv = Env::Default();
+
+ // create empty file for reading it back in later
+ auto envOptions = EnvOptions(CurrentOptions());
+ auto filePath = dbname_ + "/empty.empty";
+
+ Status status;
+ {
+ std::unique_ptr<WritableFile> writableFile;
+ status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions);
+ ASSERT_OK(status);
+ }
+
+ std::unique_ptr<SequentialFile> seqFile;
+ status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions);
+ ASSERT_OK(status);
+
+ std::string scratch;
+ Slice data;
+ // reading back 16 bytes from the empty file shouldn't trigger an assertion.
+ // it should just work and return an empty string
+ status = seqFile->Read(16, &data, (char*)scratch.data());
+ ASSERT_OK(status);
+
+ ASSERT_TRUE(data.empty());
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc
new file mode 100644
index 000000000..f0f22cb47
--- /dev/null
+++ b/src/rocksdb/db/db_filesnapshot.cc
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include <stdint.h>
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status DBImpl::DisableFileDeletions() {
+ InstrumentedMutexLock l(&mutex_);
+ ++disable_delete_obsolete_files_;
+ if (disable_delete_obsolete_files_ == 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled");
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "File Deletions Disabled, but already disabled. Counter: %d",
+ disable_delete_obsolete_files_);
+ }
+ return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+ // Job id == 0 means that this is not our background process, but rather
+  // a user thread
+ JobContext job_context(0);
+ bool file_deletion_enabled = false;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (force) {
+ // if force, we need to enable file deletions right away
+ disable_delete_obsolete_files_ = 0;
+ } else if (disable_delete_obsolete_files_ > 0) {
+ --disable_delete_obsolete_files_;
+ }
+ if (disable_delete_obsolete_files_ == 0) {
+ file_deletion_enabled = true;
+ FindObsoleteFiles(&job_context, true);
+ bg_cv_.SignalAll();
+ }
+ }
+ if (file_deletion_enabled) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled");
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "File Deletions Enable requested, but still disabled. Counter: %d",
+ disable_delete_obsolete_files_);
+ }
+ job_context.Clean();
+ LogFlush(immutable_db_options_.info_log);
+ return Status::OK();
+}
+
+int DBImpl::IsFileDeletionsEnabled() const {
+ return !disable_delete_obsolete_files_;
+}
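+
+// Illustrative usage of the reference-counted enable/disable pair above,
+// assuming a hypothetical, already-open DB* db (not code used by DBImpl):
+//
+//   db->DisableFileDeletions();      // counter == 1, deletions off
+//   db->DisableFileDeletions();      // counter == 2, still off
+//   db->EnableFileDeletions(false);  // counter == 1, still off
+//   db->EnableFileDeletions(false);  // counter == 0, deletions resume
+//   db->EnableFileDeletions(true);   // force: resets the counter to 0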
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool flush_memtable) {
+ *manifest_file_size = 0;
+
+ mutex_.Lock();
+
+ if (flush_memtable) {
+ // flush all dirty data to disk.
+ Status status;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ status = AtomicFlushMemTables(cfds, FlushOptions(),
+ FlushReason::kGetLiveFiles);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->Ref();
+ mutex_.Unlock();
+ status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
+ TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
+ TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
+ mutex_.Lock();
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+ if (!status.ok()) {
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+ status.ToString().c_str());
+ return status;
+ }
+ }
+
+ // Make a set of all of the live *.sst files
+ std::vector<FileDescriptor> live;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->current()->AddLiveFiles(&live);
+ }
+
+ ret.clear();
+ ret.reserve(live.size() + 3); // *.sst + CURRENT + MANIFEST + OPTIONS
+
+  // Create names of the live files. The names are not absolute
+  // paths; instead they are relative to dbname_.
+ for (const auto& live_file : live) {
+ ret.push_back(MakeTableFileName("", live_file.GetNumber()));
+ }
+
+ ret.push_back(CurrentFileName(""));
+ ret.push_back(DescriptorFileName("", versions_->manifest_file_number()));
+ ret.push_back(OptionsFileName("", versions_->options_file_number()));
+
+ // find length of manifest file while holding the mutex lock
+ *manifest_file_size = versions_->manifest_file_size();
+
+ mutex_.Unlock();
+ return Status::OK();
+}
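+
+// A sketch of the typical backup flow built on GetLiveFiles(), assuming a
+// hypothetical DB* db, backup_dir string, and CopyFile() helper (none of
+// which are defined here). The returned names are relative to the DB
+// directory and start with '/':
+//
+//   std::vector<std::string> files;
+//   uint64_t manifest_size = 0;
+//   db->DisableFileDeletions();
+//   db->GetLiveFiles(files, &manifest_size, /*flush_memtable=*/true);
+//   for (const auto& f : files) {
+//     // Copy only manifest_size bytes of the MANIFEST; other files in full.
+//     CopyFile(db->GetName() + f, backup_dir + f);
+//   }
+//   db->EnableFileDeletions(false /*force*/);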
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+ {
+ // If caller disabled deletions, this function should return files that are
+ // guaranteed not to be deleted until deletions are re-enabled. We need to
+ // wait for pending purges to finish since WalManager doesn't know which
+ // files are going to be purged. Additional purges won't be scheduled as
+ // long as deletions are disabled (so the below loop must terminate).
+ InstrumentedMutexLock l(&mutex_);
+ while (disable_delete_obsolete_files_ > 0 &&
+ pending_purge_obsolete_files_ > 0) {
+ bg_cv_.Wait();
+ }
+ }
+ return wal_manager_.GetSortedWalFiles(files);
+}
+
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
+ uint64_t current_logfile_number;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ current_logfile_number = logfile_number_;
+ }
+
+ return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_flush_test.cc b/src/rocksdb/db/db_flush_test.cc
new file mode 100644
index 000000000..bab206d3d
--- /dev/null
+++ b/src/rocksdb/db/db_flush_test.cc
@@ -0,0 +1,784 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBFlushTest : public DBTestBase {
+ public:
+ DBFlushTest() : DBTestBase("/db_flush_test") {}
+};
+
+class DBFlushDirectIOTest : public DBFlushTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBFlushDirectIOTest() : DBFlushTest() {}
+};
+
+class DBAtomicFlushTest : public DBFlushTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBAtomicFlushTest() : DBFlushTest() {}
+};
+
+// We had an issue where two background threads flushing at the same time
+// caused only one of them to get committed. This test verifies it is fixed.
+TEST_F(DBFlushTest, FlushWhileWritingManifest) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.max_background_flushes = 2;
+ options.env = env_;
+ Reopen(options);
+ FlushOptions no_wait;
+ no_wait.wait = false;
+  no_wait.allow_write_stall = true;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply:WriteManifest",
+ "DBFlushTest::FlushWhileWritingManifest:1"},
+ {"MemTableList::TryInstallMemtableFlushResults:InProgress",
+ "VersionSet::LogAndApply:WriteManifestDone"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("foo", "v"));
+ ASSERT_OK(dbfull()->Flush(no_wait));
+ TEST_SYNC_POINT("DBFlushTest::FlushWhileWritingManifest:1");
+ ASSERT_OK(Put("bar", "v"));
+ ASSERT_OK(dbfull()->Flush(no_wait));
+ // If the issue is hit we will wait here forever.
+ dbfull()->TEST_WaitForFlushMemTable();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(2, TotalTableFiles());
+#endif // ROCKSDB_LITE
+}
+
+// Disable this test temporarily on Travis as it fails intermittently.
+// Github issue: #4151
+TEST_F(DBFlushTest, SyncFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options;
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env.get();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBFlushTest::SyncFail:GetVersionRefCount:1",
+ "DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"},
+ {"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables",
+ "DBFlushTest::SyncFail:GetVersionRefCount:2"},
+ {"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"},
+ {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put("key", "value");
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+ ->cfd();
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+ // Flush installs a new super-version. Get the ref count after that.
+ auto current_before = cfd->current();
+ int refs_before = cfd->current()->TEST_refs();
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:2");
+ int refs_after_picking_memtables = cfd->current()->TEST_refs();
+ ASSERT_EQ(refs_before + 1, refs_after_picking_memtables);
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:2");
+ fault_injection_env->SetFilesystemActive(true);
+ // Now the background job will do the flush; wait for it.
+ dbfull()->TEST_WaitForFlushMemTable();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("", FilesPerLevel()); // flush failed.
+#endif // ROCKSDB_LITE
+  // Background flush job should release its ref count on the current version.
+ ASSERT_EQ(current_before, cfd->current());
+ ASSERT_EQ(refs_before, cfd->current()->TEST_refs());
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, SyncSkip) {
+ Options options = CurrentOptions();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBFlushTest::SyncSkip:1", "DBImpl::SyncClosedLogs:Skip"},
+ {"DBImpl::SyncClosedLogs:Skip", "DBFlushTest::SyncSkip:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ Put("key", "value");
+
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+
+ TEST_SYNC_POINT("DBFlushTest::SyncSkip:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncSkip:2");
+
+ // Now the background job will do the flush; wait for it.
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushInLowPriThreadPool) {
+ // Verify setting an empty high-pri (flush) thread pool causes flushes to be
+ // scheduled in the low-pri (compaction) thread pool.
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(1));
+ Reopen(options);
+ env_->SetBackgroundThreads(0, Env::HIGH);
+
+ std::thread::id tid;
+ int num_flushes = 0, num_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkFlush", [&](void* /*arg*/) {
+ if (tid == std::thread::id()) {
+ tid = std::this_thread::get_id();
+ } else {
+ ASSERT_EQ(tid, std::this_thread::get_id());
+ }
+ ++num_flushes;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction", [&](void* /*arg*/) {
+ ASSERT_EQ(tid, std::this_thread::get_id());
+ ++num_compactions;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key", "val"));
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put("key", "val"));
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(4, num_flushes);
+ ASSERT_EQ(1, num_compactions);
+}
+
+TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkFlush",
+ "DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"},
+ {"DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2",
+ "FlushJob::WriteLevel0Table"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key1", "value1"));
+
+ port::Thread t([&]() {
+    // The call waits for flush to finish, i.e. with flush_options.wait = true.
+ ASSERT_OK(Flush());
+ });
+
+ // Wait for flush start.
+ TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1");
+  // Insert a second memtable before the manual flush finishes.
+  // At the end of the manual flush job, it will check whether a further flush
+  // is needed, but it will not trigger a flush of the second memtable because
+  // min_write_buffer_number_to_merge has not been reached.
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2");
+
+ // Manual flush should return, without waiting for flush indefinitely.
+ t.join();
+}
+
+TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) {
+ Options options = CurrentOptions();
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int called = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ auto unscheduled_flushes = *reinterpret_cast<int*>(arg);
+ ASSERT_EQ(0, unscheduled_flushes);
+ ++called;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("a", "foo"));
+ FlushOptions flush_opts;
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_EQ(1, called);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBFlushDirectIOTest, DirectIO) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.max_background_flushes = 2;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = new MockEnv(Env::Default());
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:create_file", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v"));
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+ Destroy(options);
+ delete options.env;
+}
+
+TEST_F(DBFlushTest, FlushError) {
+ Options options;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env.get();
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ fault_injection_env->SetFilesystemActive(false);
+ Status s = dbfull()->TEST_SwitchMemtable();
+ fault_injection_env->SetFilesystemActive(true);
+ Destroy(options);
+ ASSERT_NE(s, Status::OK());
+}
+
+TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) {
+ // Regression test for bug where manual flush hangs forever when the DB
+ // is in read-only mode. Verify it now at least returns, despite failing.
+ Options options;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_injection_env.get();
+ options.max_write_buffer_number = 2;
+ Reopen(options);
+
+ // Trigger a first flush but don't let it run
+ ASSERT_OK(db_->PauseBackgroundWork());
+ ASSERT_OK(Put("key1", "value1"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db_->Flush(flush_opts));
+
+ // Write a key to the second memtable so we have something to flush later
+ // after the DB is in read-only mode.
+ ASSERT_OK(Put("key2", "value2"));
+
+ // Let the first flush continue, hit an error, and put the DB in read-only
+ // mode.
+ fault_injection_env->SetFilesystemActive(false);
+ ASSERT_OK(db_->ContinueBackgroundWork());
+ dbfull()->TEST_WaitForFlushMemTable();
+#ifndef ROCKSDB_LITE
+ uint64_t num_bg_errors;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors,
+ &num_bg_errors));
+ ASSERT_GT(num_bg_errors, 0);
+#endif // ROCKSDB_LITE
+
+ // In the bug scenario, triggering another flush would cause the second flush
+ // to hang forever. After the fix we expect it to return an error.
+ ASSERT_NOK(db_->Flush(FlushOptions()));
+
+ Close();
+}
+
+TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:AfterScheduleFlush",
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+ {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BackgroundCallFlush:start",
+ "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put(1, "key", "value"));
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ port::Thread drop_cf_thr([&]() {
+ TEST_SYNC_POINT(
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_.resize(1);
+ TEST_SYNC_POINT(
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+ });
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts));
+ drop_cf_thr.join();
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) {
+ class TestListener : public EventListener {
+ public:
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ // There's only one key in each flush.
+ ASSERT_EQ(info.smallest_seqno, info.largest_seqno);
+ ASSERT_NE(0, info.smallest_seqno);
+ if (info.smallest_seqno == seq1) {
+ // First flush completed
+ ASSERT_FALSE(completed1);
+ completed1 = true;
+ CheckFlushResultCommitted(db, seq1);
+ } else {
+ // Second flush completed
+ ASSERT_FALSE(completed2);
+ completed2 = true;
+ ASSERT_EQ(info.smallest_seqno, seq2);
+ CheckFlushResultCommitted(db, seq2);
+ }
+ }
+
+ void CheckFlushResultCommitted(DB* db, SequenceNumber seq) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db);
+ InstrumentedMutex* mutex = db_impl->mutex();
+ mutex->Lock();
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(db->DefaultColumnFamily())
+ ->cfd();
+ ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber());
+ mutex->Unlock();
+ }
+
+ std::atomic<SequenceNumber> seq1{0};
+ std::atomic<SequenceNumber> seq2{0};
+ std::atomic<bool> completed1{false};
+ std::atomic<bool> completed2{false};
+ };
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:start",
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"},
+ {"DBImpl::FlushMemTableToOutputFile:Finish",
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table", [&listener](void* arg) {
+        // Wait for the second flush to finish, outside the mutex.
+ auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+ if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) {
+ TEST_SYNC_POINT(
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:"
+ "WaitSecond");
+ }
+ });
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ // Setting max_flush_jobs = max_background_jobs / 4 = 2.
+ options.max_background_jobs = 8;
+ // Allow 2 immutable memtables.
+ options.max_write_buffer_number = 3;
+ Reopen(options);
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("foo", "v"));
+ listener->seq1 = db_->GetLatestSequenceNumber();
+ // t1 will wait for the second flush complete before committing flush result.
+ auto t1 = port::Thread([&]() {
+ // flush_opts.wait = true
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ });
+ // Wait for first flush started.
+ TEST_SYNC_POINT(
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst");
+  // The second flush will exit early without committing its result. The work
+  // is delegated to the first flush.
+ ASSERT_OK(Put("bar", "v"));
+ listener->seq2 = db_->GetLatestSequenceNumber();
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db_->Flush(flush_opts));
+ t1.join();
+ ASSERT_TRUE(listener->completed1);
+ ASSERT_TRUE(listener->completed2);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ options.write_buffer_size = (static_cast<size_t>(64) << 20);
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ cf_ids.emplace_back(static_cast<int>(i));
+ }
+ ASSERT_OK(Flush(cf_ids));
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+}
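+
+// For reference, outside the test harness the same atomic flush of several
+// column families goes through the public multi-CF Flush overload; a minimal
+// sketch assuming a hypothetical DB* db opened with handles cf1 and cf2:
+//
+//   Options options;
+//   options.atomic_flush = true;  // set at open time
+//   // ...
+//   Status s = db->Flush(FlushOptions(), {cf1, cf2});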
+
+TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ // 4KB so that we can easily trigger auto flush.
+ options.write_buffer_size = 4096;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:FlushFinish:0",
+ "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+  // Keep writing to one of the column families to trigger auto flush.
+ for (int i = 0; i != 4000; ++i) {
+ ASSERT_OK(Put(static_cast<int>(num_cfs) - 1 /*cf*/,
+ "key" + std::to_string(i), "value" + std::to_string(i),
+ wopts));
+ }
+
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck");
+ if (options.atomic_flush) {
+ for (size_t i = 0; i != num_cfs - 1; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+ } else {
+ for (size_t i = 0; i != num_cfs - 1; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+ }
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1",
+ "DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"},
+ {"DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2",
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ }
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2");
+ for (auto* cfh : handles_) {
+ dbfull()->TEST_WaitForFlushMemTable(cfh);
+ }
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(1, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+ fault_injection_env->SetFilesystemActive(true);
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ cf_ids.push_back(cf_id);
+ }
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped());
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest,
+ FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+ "DBAtomicFlushTest::BeforeDropCF"},
+ {"DBAtomicFlushTest::AfterDropCF",
+ "DBImpl::BackgroundCallFlush:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ }
+ port::Thread user_thread([&]() {
+ TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF");
+ });
+ FlushOptions flush_opts;
+ flush_opts.wait = true;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ user_thread.join();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_EQ("value", Get(cf_id, "key"));
+ }
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options);
+ num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_EQ("value", Get(cf_id, "key"));
+ }
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ const int kNumKeysTriggerFlush = 4;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysTriggerFlush));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i != kNumKeysTriggerFlush; ++i) {
+ ASSERT_OK(Put(0, "key" + std::to_string(i), "value" + std::to_string(i)));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(0, "key", "value"));
+ Close();
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ("value", Get(0, "key"));
+}
+
+TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) {
+ bool atomic_flush = GetParam();
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.max_write_buffer_number = 4;
+ // Set min_write_buffer_number_to_merge to be greater than 1, so that
+ // a column family with one memtable in the imm will not cause IsFlushPending
+ // to return true when flush_requested_ is false.
+ options.min_write_buffer_number_to_merge = 2;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(dbfull()->PauseBackgroundWork());
+ ASSERT_OK(Put(0, "key00", "value00"));
+ ASSERT_OK(Put(1, "key10", "value10"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ ASSERT_OK(Put(0, "key01", "value01"));
+ // Since max_write_buffer_number is 4, the following flush won't cause write
+ // stall.
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_[1] = nullptr;
+ ASSERT_OK(dbfull()->ContinueBackgroundWork());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ delete handles_[0];
+ handles_.clear();
+}
+
+TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+ {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BackgroundCallFlush:start",
+ "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put(0, "key", "value"));
+ ASSERT_OK(Put(1, "key", "value"));
+ auto* cfd_default =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily())
+ ->cfd();
+ auto* cfd_pikachu = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ port::Thread drop_cf_thr([&]() {
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ delete handles_[1];
+ handles_.resize(1);
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+ });
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu},
+ flush_opts));
+ drop_cf_thr.join();
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, RollbackAfterFailToInstallResults) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), "a", "value"));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ [&](void* /*arg*/) { fault_injection_env->SetFilesystemActive(false); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ FlushOptions flush_opts;
+ Status s = db_->Flush(flush_opts, handles_);
+ ASSERT_NOK(s);
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
+ testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc
new file mode 100644
index 000000000..d7880fc1a
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.cc
@@ -0,0 +1,4550 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <stdint.h>
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <map>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_info_dumper.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/forward_iterator.h"
+#include "db/import_column_family_job.h"
+#include "db/job_context.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/malloc_stats.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/write_callback.h"
+#include "env/composite_env_wrapper.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "memtable/hash_linklist_rep.h"
+#include "memtable/hash_skiplist_rep.h"
+#include "monitoring/in_memory_stats_history.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/stats_history.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/merging_iterator.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "tools/sst_dump_tool_imp.h"
+#include "util/autovector.h"
+#include "util/build_version.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kDefaultColumnFamilyName("default");
+const std::string kPersistentStatsColumnFamilyName(
+ "___rocksdb_stats_history___");
+void DumpRocksDBBuildVersion(Logger* log);
+
+CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options) {
+ // Compressing memtable flushes might not help unless the sequential load
+ // optimization is used for leveled compaction. Otherwise the CPU and
+ // latency overhead is not offset by saving much space.
+ if (ioptions.compaction_style == kCompactionStyleUniversal) {
+ if (mutable_cf_options.compaction_options_universal
+ .compression_size_percent < 0) {
+ return mutable_cf_options.compression;
+ } else {
+ return kNoCompression;
+ }
+ } else if (!ioptions.compression_per_level.empty()) {
+ // For leveled compress when min_level_to_compress != 0.
+ return ioptions.compression_per_level[0];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
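+
+// Two illustrative configurations showing how the branches above resolve;
+// a sketch using hypothetical Options objects, not code used by DBImpl:
+//
+//   // (a) Leveled compaction with per-level compression: flush output uses
+//   //     compression_per_level[0], here kNoCompression.
+//   Options leveled;
+//   leveled.compression_per_level = {kNoCompression, kSnappyCompression};
+//
+//   // (b) Universal compaction: flush output keeps `compression` only while
+//   //     compression_size_percent is negative (its default, -1); otherwise
+//   //     flushes are written uncompressed.
+//   Options universal;
+//   universal.compaction_style = kCompactionStyleUniversal;
+//   universal.compression = kSnappyCompression;
+//   universal.compaction_options_universal.compression_size_percent = -1;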
+
+namespace {
+void DumpSupportInfo(Logger* logger) {
+ ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
+ for (auto& compression : OptionsHelper::compression_type_string_map) {
+ if (compression.second != kNoCompression &&
+ compression.second != kDisableCompressionOption) {
+ ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
+ CompressionTypeSupported(compression.second));
+ }
+ }
+ ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
+ crc32c::IsFastCrc32Supported().c_str());
+}
+} // namespace
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch, const bool batch_per_txn)
+ : dbname_(dbname),
+ own_info_log_(options.info_log == nullptr),
+ initial_db_options_(SanitizeOptions(dbname, options)),
+ env_(initial_db_options_.env),
+ fs_(initial_db_options_.file_system),
+ immutable_db_options_(initial_db_options_),
+ mutable_db_options_(initial_db_options_),
+ stats_(immutable_db_options_.statistics.get()),
+ mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS,
+ immutable_db_options_.use_adaptive_mutex),
+ default_cf_handle_(nullptr),
+ max_total_in_memory_state_(0),
+ file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
+ file_options_, immutable_db_options_)),
+ seq_per_batch_(seq_per_batch),
+ batch_per_txn_(batch_per_txn),
+ db_lock_(nullptr),
+ shutting_down_(false),
+ manual_compaction_paused_(false),
+ bg_cv_(&mutex_),
+ logfile_number_(0),
+ log_dir_synced_(false),
+ log_empty_(true),
+ persist_stats_cf_handle_(nullptr),
+ log_sync_cv_(&mutex_),
+ total_log_size_(0),
+ is_snapshot_supported_(true),
+ write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
+ write_thread_(immutable_db_options_),
+ nonmem_write_thread_(immutable_db_options_),
+ write_controller_(mutable_db_options_.delayed_write_rate),
+ last_batch_group_size_(0),
+ unscheduled_flushes_(0),
+ unscheduled_compactions_(0),
+ bg_bottom_compaction_scheduled_(0),
+ bg_compaction_scheduled_(0),
+ num_running_compactions_(0),
+ bg_flush_scheduled_(0),
+ num_running_flushes_(0),
+ bg_purge_scheduled_(0),
+ disable_delete_obsolete_files_(0),
+ pending_purge_obsolete_files_(0),
+ delete_obsolete_files_last_run_(env_->NowMicros()),
+ last_stats_dump_time_microsec_(0),
+ next_job_id_(1),
+ has_unpersisted_data_(false),
+ unable_to_release_oldest_log_(false),
+ num_running_ingest_file_(0),
+#ifndef ROCKSDB_LITE
+ wal_manager_(immutable_db_options_, file_options_, seq_per_batch),
+#endif // ROCKSDB_LITE
+ event_logger_(immutable_db_options_.info_log.get()),
+ bg_work_paused_(0),
+ bg_compaction_paused_(0),
+ refitting_level_(false),
+ opened_successfully_(false),
+ two_write_queues_(options.two_write_queues),
+ manual_wal_flush_(options.manual_wal_flush),
+      // last_sequence_ is always maintained by the main queue that also writes
+      // to the memtable. When two_write_queues_ is disabled, last seq in the
+      // memtable is the same as the last seq published to the readers. When it
+      // is enabled but seq_per_batch_ is disabled, last seq in the memtable
+      // still indicates the last published seq since wal-only writes that go
+      // to the 2nd queue do not consume a sequence number. Otherwise writes
+      // performed by the 2nd queue could change what is visible to the
+      // readers. In that case, last_seq_same_as_publish_seq_ == false and the
+      // 2nd queue maintains a separate variable to indicate the last published
+      // sequence.
+ last_seq_same_as_publish_seq_(
+ !(seq_per_batch && options.two_write_queues)),
+ // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+ // requires a custom gc for compaction, we use that to set use_custom_gc_
+ // as well.
+ use_custom_gc_(seq_per_batch),
+ shutdown_initiated_(false),
+ own_sfm_(options.sst_file_manager == nullptr),
+ preserve_deletes_(options.preserve_deletes),
+ closed_(false),
+ error_handler_(this, immutable_db_options_, &mutex_),
+ atomic_flush_install_cv_(&mutex_) {
+  // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
+ // WriteUnprepared, which should use seq_per_batch_.
+ assert(batch_per_txn_ || seq_per_batch_);
+ env_->GetAbsolutePath(dbname, &db_absolute_path_);
+
+ // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Use a large number for the setting of "infinite" open files.
+ const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+ ? TableCache::kInfiniteCapacity
+ : mutable_db_options_.max_open_files - 10;
+ LRUCacheOptions co;
+ co.capacity = table_cache_size;
+ co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ table_cache_ = NewLRUCache(co);
+
+ versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+ table_cache_.get(), write_buffer_manager_,
+ &write_controller_, &block_cache_tracer_));
+ column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+ DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+ DumpDBFileSummary(immutable_db_options_, dbname_);
+ immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ DumpSupportInfo(immutable_db_options_.info_log.get());
+
+  // Always open the DB with 0 here, which means that if preserve_deletes_==true
+  // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber()
+  // is called by the client and this seqnum is advanced.
+ preserve_deletes_seqnum_.store(0);
+}
+
+Status DBImpl::Resume() {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+ InstrumentedMutexLock db_mutex(&mutex_);
+
+ if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+ // Nothing to do
+ return Status::OK();
+ }
+
+ if (error_handler_.IsRecoveryInProgress()) {
+ // Don't allow a mix of manual and automatic recovery
+ return Status::Busy();
+ }
+
+ mutex_.Unlock();
+ Status s = error_handler_.RecoverFromBGError(true);
+ mutex_.Lock();
+ return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual and automatic recovery. It does
+// the following:
+// 1. Wait for currently scheduled background flush/compaction to exit, in
+//    order to avoid inadvertently causing an error and concluding that
+//    recovery failed
+// 2. Flush memtables if there's any data for all the CFs. This may result in
+//    another error, which will be saved by error_handler_ and reported later
+// as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs. This is needed as the
+// flush in the prior step might have been a no-op for some CFs, which
+// means a new super version wouldn't have been installed
+Status DBImpl::ResumeImpl() {
+ mutex_.AssertHeld();
+ WaitForBackgroundWork();
+
+ Status bg_error = error_handler_.GetBGError();
+ Status s;
+ if (shutdown_initiated_) {
+ // Returning shutdown status to SFM during auto recovery will cause it
+ // to abort the recovery and allow the shutdown to progress
+ s = Status::ShutdownInProgress();
+ }
+ if (s.ok() && bg_error.severity() > Status::Severity::kHardError) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "DB resume requested but failed due to Fatal/Unrecoverable error");
+ s = bg_error;
+ }
+
+ // We cannot guarantee consistency of the WAL. So force flush Memtables of
+ // all the column families
+ if (s.ok()) {
+ FlushOptions flush_opts;
+ // We allow flush to stall write since we are trying to resume from error.
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->Ref();
+ mutex_.Unlock();
+ s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery);
+ mutex_.Lock();
+ cfd->UnrefAndTryDelete();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DB resume requested but failed due to Flush failure [%s]",
+ s.ToString().c_str());
+ }
+ }
+
+ JobContext job_context(0);
+ FindObsoleteFiles(&job_context, true);
+ if (s.ok()) {
+ s = error_handler_.ClearBGError();
+ }
+ mutex_.Unlock();
+
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+ }
+ mutex_.Lock();
+ // Check for shutdown again before scheduling further compactions,
+ // since we released and re-acquired the lock above
+ if (shutdown_initiated_) {
+ s = Status::ShutdownInProgress();
+ }
+ if (s.ok()) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ SchedulePendingCompaction(cfd);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ // Wake up any waiters - in this case, it could be the shutdown thread
+ bg_cv_.SignalAll();
+
+ // No need to check BGError again. If something happened, event listener would
+ // be notified and the operation causing it would have failed
+ return s;
+}
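+
+// A sketch of the caller-side pattern that this recovery path serves, for a
+// hypothetical application whose writes start failing after a background
+// error (e.g. out of disk space); the flow is illustrative, not prescriptive:
+//
+//   Status s = db->Put(WriteOptions(), "key", "value");
+//   if (!s.ok()) {
+//     // ... free disk space or otherwise fix the underlying cause first ...
+//     Status r = db->Resume();  // reaches ResumeImpl() via the error handler
+//     // r is non-OK if the error was fatal/unrecoverable or a shutdown is
+//     // already in progress.
+//   }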
+
+void DBImpl::WaitForBackgroundWork() {
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_) {
+ bg_cv_.Wait();
+ }
+}
+
+// Will lock the mutex_, will wait for completion if wait is true
+void DBImpl::CancelAllBackgroundWork(bool wait) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Shutdown: canceling all background work");
+
+ if (thread_dump_stats_ != nullptr) {
+ thread_dump_stats_->cancel();
+ thread_dump_stats_.reset();
+ }
+ if (thread_persist_stats_ != nullptr) {
+ thread_persist_stats_->cancel();
+ thread_persist_stats_.reset();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ if (!shutting_down_.load(std::memory_order_acquire) &&
+ has_unpersisted_data_.load(std::memory_order_relaxed) &&
+ !mutable_db_options_.avoid_flush_during_shutdown) {
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
+ cfd->Ref();
+ mutex_.Unlock();
+ FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
+ mutex_.Lock();
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ }
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+ }
+
+ shutting_down_.store(true, std::memory_order_release);
+ bg_cv_.SignalAll();
+ if (!wait) {
+ return;
+ }
+ WaitForBackgroundWork();
+}
+
+Status DBImpl::CloseHelper() {
+ // Guarantee that there is no background error recovery in progress before
+ // continuing with the shutdown
+ mutex_.Lock();
+ shutdown_initiated_ = true;
+ error_handler_.CancelErrorRecovery();
+ while (error_handler_.IsRecoveryInProgress()) {
+ bg_cv_.Wait();
+ }
+ mutex_.Unlock();
+
+ // CancelAllBackgroundWork called with false means we just set the shutdown
+ // marker. After this we do a variant of the waiting and unschedule work
+ // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+ CancelAllBackgroundWork(false);
+ int bottom_compactions_unscheduled =
+ env_->UnSchedule(this, Env::Priority::BOTTOM);
+ int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
+ int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
+ Status ret;
+ mutex_.Lock();
+ bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled;
+ bg_compaction_scheduled_ -= compactions_unscheduled;
+ bg_flush_scheduled_ -= flushes_unscheduled;
+
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ || bg_purge_scheduled_ ||
+ pending_purge_obsolete_files_ ||
+ error_handler_.IsRecoveryInProgress()) {
+ TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
+ bg_cv_.Wait();
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
+ &files_grabbed_for_purge_);
+ EraseThreadStatusDbInfo();
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+
+ while (!flush_queue_.empty()) {
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ for (const auto& iter : flush_req) {
+ iter.first->UnrefAndTryDelete();
+ }
+ }
+ while (!compaction_queue_.empty()) {
+ auto cfd = PopFirstFromCompactionQueue();
+ cfd->UnrefAndTryDelete();
+ }
+
+ if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
+    // We need to delete the handles outside of the lock because deletion does
+    // its own locking
+ mutex_.Unlock();
+ if (default_cf_handle_) {
+ delete default_cf_handle_;
+ default_cf_handle_ = nullptr;
+ }
+ if (persist_stats_cf_handle_) {
+ delete persist_stats_cf_handle_;
+ persist_stats_cf_handle_ = nullptr;
+ }
+ mutex_.Lock();
+ }
+
+ // Clean up obsolete files due to SuperVersion release.
+  // (1) Need to delete obsolete files before closing because RepairDB()
+  // scans all existing files in the file system and builds the manifest file.
+  // Keeping obsolete files confuses the repair process.
+  // (2) Need to check whether we Open()/Recover() the DB successfully before
+  // deleting, because if VersionSet recovery fails (possibly due to a
+  // corrupted manifest file), it is not able to identify live files correctly.
+  // As a result, all "live" files could get deleted by accident. However, a
+  // corrupted manifest is recoverable by RepairDB().
+ if (opened_successfully_) {
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ // manifest number starting from 2
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+
+ for (auto l : logs_to_free_) {
+ delete l;
+ }
+ for (auto& log : logs_) {
+ uint64_t log_number = log.writer->get_log_number();
+ Status s = log.ClearWriter();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to Sync WAL file %s with error -- %s",
+ LogFileName(immutable_db_options_.wal_dir, log_number).c_str(),
+ s.ToString().c_str());
+ // Retain the first error
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+ }
+ logs_.clear();
+
+  // Table cache may have table handles holding blocks from the block cache.
+  // We need to release them before the block cache is destroyed. The block
+  // cache may be destroyed inside versions_.reset(), when the column family
+  // data list is destroyed, so leaving handles in the table cache after
+  // versions_.reset() may cause issues. Here we clean all unreferenced
+  // handles in the table cache. At this point we assume all user queries have
+  // finished, so only the version set itself can possibly still hold blocks
+  // from the block cache. After releasing the unreferenced handles here, only
+  // the handles held by the version set remain, and they are released inside
+  // versions_.reset(). There, we need to make sure that every time a handle
+  // is released, it is also erased from the cache. By doing that, we can
+  // guarantee that after versions_.reset() the table cache is empty, so the
+  // block cache can be safely destroyed.
+ table_cache_->EraseUnRefEntries();
+
+ for (auto& txn_entry : recovered_transactions_) {
+ delete txn_entry.second;
+ }
+
+ // versions need to be destroyed before table_cache since it can hold
+ // references to table_cache.
+ versions_.reset();
+ mutex_.Unlock();
+ if (db_lock_ != nullptr) {
+ env_->UnlockFile(db_lock_);
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+ LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+  // If the sst_file_manager was allocated by us during DB::Open(), call
+  // Close() on it before closing the info_log. Otherwise, a background thread
+  // in SstFileManagerImpl might try to log something.
+ if (immutable_db_options_.sst_file_manager && own_sfm_) {
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ sfm->Close();
+ }
+#endif // ROCKSDB_LITE
+
+ if (immutable_db_options_.info_log && own_info_log_) {
+ Status s = immutable_db_options_.info_log->Close();
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+
+ if (ret.IsAborted()) {
+    // IsAborted() is reserved for cases where the user did not release a
+    // certain resource and can release it, come back, and retry. Since that
+    // is not the case here, wrap the error into something else.
+ return Status::Incomplete(ret.ToString());
+ }
+ return ret;
+}
+
+Status DBImpl::CloseImpl() { return CloseHelper(); }
+
+DBImpl::~DBImpl() {
+ if (!closed_) {
+ closed_ = true;
+ CloseHelper();
+ }
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+ if (s->ok() || immutable_db_options_.paranoid_checks) {
+ // No change needed
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
+ s->ToString().c_str());
+ *s = Status::OK();
+ }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+ if (immutable_db_options_.wal_ttl_seconds > 0 ||
+ immutable_db_options_.wal_size_limit_mb > 0) {
+ std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir);
+ return env_->CreateDirIfMissing(archivalPath);
+ }
+ return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+ auto dbstats = immutable_db_options_.statistics.get();
+ if (dbstats) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
+ dbstats->ToString().c_str());
+ }
+}
+
+void DBImpl::StartTimedTasks() {
+ unsigned int stats_dump_period_sec = 0;
+ unsigned int stats_persist_period_sec = 0;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec;
+ if (stats_dump_period_sec > 0) {
+ if (!thread_dump_stats_) {
+ thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::DumpStats(); }, "dump_st", env_,
+ static_cast<uint64_t>(stats_dump_period_sec) * kMicrosInSecond));
+ }
+ }
+ stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec;
+ if (stats_persist_period_sec > 0) {
+ if (!thread_persist_stats_) {
+ thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::PersistStats(); }, "pst_st", env_,
+ static_cast<uint64_t>(stats_persist_period_sec) * kMicrosInSecond));
+ }
+ }
+ }
+}
+
+// Estimate the total size of stats_history_
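+// The estimate samples the first (oldest) slice and assumes every slice has
+// the same set of stat names, so this is an approximation rather than an
+// exact byte count.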
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+ size_t size_total =
+ sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+ if (stats_history_.size() == 0) return size_total;
+ size_t size_per_slice =
+ sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+ // non-empty map, stats_history_.begin() guaranteed to exist
+ std::map<std::string, uint64_t> sample_slice(stats_history_.begin()->second);
+ for (const auto& pairs : sample_slice) {
+ size_per_slice +=
+ pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+ }
+ size_total = size_per_slice * stats_history_.size();
+ return size_total;
+}
+
+void DBImpl::PersistStats() {
+ TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+#ifndef ROCKSDB_LITE
+ if (shutdown_initiated_) {
+ return;
+ }
+ uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond;
+ Statistics* statistics = immutable_db_options_.statistics.get();
+ if (!statistics) {
+ return;
+ }
+ size_t stats_history_size_limit = 0;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+ }
+
+ std::map<std::string, uint64_t> stats_map;
+ if (!statistics->getTickerMap(&stats_map)) {
+ return;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- PERSISTING STATS -------");
+
+ if (immutable_db_options_.persist_stats_to_disk) {
+ WriteBatch batch;
+ if (stats_slice_initialized_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+ stats_slice_.size());
+ for (const auto& stat : stats_map) {
+ char key[100];
+ int length =
+ EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+ // calculate the delta from last time
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ uint64_t delta = stat.second - stats_slice_[stat.first];
+ batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)),
+ ToString(delta));
+ }
+ }
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ Status s = Write(wo, &batch);
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing to persistent stats CF failed -- %s",
+ s.ToString().c_str());
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to persistent stats CF succeeded",
+ stats_slice_.size(), now_seconds);
+ }
+ // TODO(Zhongyi): add purging for persisted data
+ } else {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ // calculate the delta from last time
+ if (stats_slice_initialized_) {
+ std::map<std::string, uint64_t> stats_delta;
+ for (const auto& stat : stats_map) {
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ stats_delta[stat.first] = stat.second - stats_slice_[stat.first];
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to in-memory stats history",
+ stats_slice_.size(), now_seconds);
+ stats_history_[now_seconds] = stats_delta;
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");
+
+ // delete older stats snapshots to control memory consumption
+ size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+ bool purge_needed = stats_history_size > stats_history_size_limit;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ while (purge_needed && !stats_history_.empty()) {
+ stats_history_.erase(stats_history_.begin());
+ purge_needed =
+ EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ }
+#endif // !ROCKSDB_LITE
+}
+
+bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map) {
+ assert(new_time);
+ assert(stats_map);
+ if (!new_time || !stats_map) return false;
+  // lock while searching for start_time
+ {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ auto it = stats_history_.lower_bound(start_time);
+ if (it != stats_history_.end() && it->first < end_time) {
+ // make a copy for timestamp and stats_map
+ *new_time = it->first;
+ *stats_map = it->second;
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
+
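+// Illustrative use of the iterator returned by GetStatsHistory() (a sketch
+// only, not part of this file; it assumes the StatsHistoryIterator interface
+// declared in include/rocksdb/stats_history.h):
+//
+//   std::unique_ptr<StatsHistoryIterator> it;
+//   Status s = db->GetStatsHistory(start_time, end_time, &it);
+//   for (; s.ok() && it->Valid(); it->Next()) {
+//     uint64_t time = it->GetStatsTime();
+//     const std::map<std::string, uint64_t>& stats = it->GetStatsMap();
+//     // ... consume (time, stats) ...
+//   }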
+Status DBImpl::GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) {
+ if (!stats_iterator) {
+ return Status::InvalidArgument("stats_iterator not preallocated.");
+ }
+ if (immutable_db_options_.persist_stats_to_disk) {
+ stats_iterator->reset(
+ new PersistentStatsHistoryIterator(start_time, end_time, this));
+ } else {
+ stats_iterator->reset(
+ new InMemoryStatsHistoryIterator(start_time, end_time, this));
+ }
+ return (*stats_iterator)->status();
+}
+
+void DBImpl::DumpStats() {
+ TEST_SYNC_POINT("DBImpl::DumpStats:1");
+#ifndef ROCKSDB_LITE
+ const DBPropertyInfo* cf_property_info =
+ GetPropertyInfo(DB::Properties::kCFStats);
+ assert(cf_property_info != nullptr);
+ const DBPropertyInfo* db_property_info =
+ GetPropertyInfo(DB::Properties::kDBStats);
+ assert(db_property_info != nullptr);
+
+ std::string stats;
+ if (shutdown_initiated_) {
+ return;
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ default_cf_internal_stats_->GetStringProperty(
+ *db_property_info, DB::Properties::kDBStats, &stats);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->GetStringProperty(
+ *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats);
+ }
+ }
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->GetStringProperty(
+ *cf_property_info, DB::Properties::kCFFileHistogram, &stats);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::DumpStats:2");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- DUMPING STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ if (immutable_db_options_.dump_malloc_stats) {
+ stats.clear();
+ DumpMallocStats(&stats);
+ if (!stats.empty()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- Malloc STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ PrintStatistics();
+}
+
+Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str) {
+ auto* cfh =
+ static_cast_with_check<ColumnFamilyHandleImpl, ColumnFamilyHandle>(
+ column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Version* version = super_version->current;
+
+ Status s =
+ version->TablesRangeTombstoneSummary(max_entries_to_print, out_str);
+
+ CleanupSuperVersion(super_version);
+ return s;
+}
+
+void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
+ if (!job_context->logs_to_free.empty()) {
+ for (auto l : job_context->logs_to_free) {
+ AddToLogsToFreeQueue(l);
+ }
+ job_context->logs_to_free.clear();
+ }
+}
+
+Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
+ assert(cfd);
+ Directory* ret_dir = cfd->GetDataDir(path_id);
+ if (ret_dir == nullptr) {
+ return directories_.GetDataDir(path_id);
+ }
+ return ret_dir;
+}
+
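+// Illustrative call pattern for the two dynamic-option entry points below
+// (a sketch only; the option names are just examples of mutable options):
+//
+//   db->SetOptions(cf_handle, {{"write_buffer_size", "131072"}});
+//   db->SetDBOptions({{"max_background_jobs", "8"}});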
+Status DBImpl::SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)column_family;
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], empty input",
+ cfd->GetName().c_str());
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableCFOptions new_options;
+ Status s;
+ Status persist_options_status;
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ auto db_options = GetDBOptions();
+ InstrumentedMutexLock l(&mutex_);
+ s = cfd->SetOptions(db_options, options_map);
+ if (s.ok()) {
+ new_options = *cfd->GetLatestMutableCFOptions();
+ // Append new version to recompute compaction score.
+ VersionEdit dummy_edit;
+ versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ // Trigger possible flush/compactions. This has to be before we persist
+ // options to file, otherwise there will be a deadlock with writer
+ // thread.
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
+
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ bg_cv_.SignalAll();
+ }
+ }
+ sv_context.Clean();
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] SetOptions() succeeded", cfd->GetName().c_str());
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
+ cfd->GetName().c_str());
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+Status DBImpl::SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetDBOptions(), empty input.");
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableDBOptions new_options;
+ Status s;
+ Status persist_options_status;
+ bool wal_changed = false;
+ WriteContext write_context;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
+ &new_options);
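+    // Note: a bytes_per_sync of 0 in the new options is overridden to 1MB
+    // here rather than being applied as-is.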
+ if (new_options.bytes_per_sync == 0) {
+ new_options.bytes_per_sync = 1024 * 1024;
+ }
+ DBOptions new_db_options =
+ BuildDBOptions(immutable_db_options_, new_options);
+ if (s.ok()) {
+ s = ValidateOptions(new_db_options);
+ }
+ if (s.ok()) {
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped()) {
+ auto cf_options = c->GetLatestCFOptions();
+ s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ const BGJobLimits current_bg_job_limits =
+ GetBGJobLimits(immutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ /* parallelize_compactions */ true);
+ const BGJobLimits new_bg_job_limits = GetBGJobLimits(
+ immutable_db_options_.max_background_flushes,
+ new_options.max_background_compactions,
+ new_options.max_background_jobs, /* parallelize_compactions */ true);
+
+ const bool max_flushes_increased =
+ new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes;
+ const bool max_compactions_increased =
+ new_bg_job_limits.max_compactions >
+ current_bg_job_limits.max_compactions;
+
+ if (max_flushes_increased || max_compactions_increased) {
+ if (max_flushes_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+ }
+
+ if (max_compactions_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ }
+
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (new_options.stats_dump_period_sec !=
+ mutable_db_options_.stats_dump_period_sec) {
+ if (thread_dump_stats_) {
+ mutex_.Unlock();
+ thread_dump_stats_->cancel();
+ mutex_.Lock();
+ }
+ if (new_options.stats_dump_period_sec > 0) {
+ thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::DumpStats(); }, "dump_st", env_,
+ static_cast<uint64_t>(new_options.stats_dump_period_sec) *
+ kMicrosInSecond));
+ } else {
+ thread_dump_stats_.reset();
+ }
+ }
+ if (new_options.stats_persist_period_sec !=
+ mutable_db_options_.stats_persist_period_sec) {
+ if (thread_persist_stats_) {
+ mutex_.Unlock();
+ thread_persist_stats_->cancel();
+ mutex_.Lock();
+ }
+ if (new_options.stats_persist_period_sec > 0) {
+ thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::PersistStats(); }, "pst_st", env_,
+ static_cast<uint64_t>(new_options.stats_persist_period_sec) *
+ kMicrosInSecond));
+ } else {
+ thread_persist_stats_.reset();
+ }
+ }
+ write_controller_.set_max_delayed_write_rate(
+ new_options.delayed_write_rate);
+ table_cache_.get()->SetCapacity(new_options.max_open_files == -1
+ ? TableCache::kInfiniteCapacity
+ : new_options.max_open_files - 10);
+ wal_changed = mutable_db_options_.wal_bytes_per_sync !=
+ new_options.wal_bytes_per_sync;
+ mutable_db_options_ = new_options;
+ file_options_for_compaction_ = FileOptions(new_db_options);
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
+ file_options_for_compaction_, immutable_db_options_);
+ versions_->ChangeFileOptions(mutable_db_options_);
+      // TODO(xiez): clarify why optimize-for-read is applied to write options
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
+ file_options_for_compaction_, immutable_db_options_);
+ file_options_for_compaction_.compaction_readahead_size =
+ mutable_db_options_.compaction_readahead_size;
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
+ Status purge_wal_status = SwitchWAL(&write_context);
+ if (!purge_wal_status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to purge WAL files in SetDBOptions() -- %s",
+ purge_wal_status.ToString().c_str());
+ }
+ }
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+ write_thread_.ExitUnbatched(&w);
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ if (immutable_db_options_.fail_if_options_file_error) {
+ s = Status::IOError(
+ "SetDBOptions() succeeded, but unable to persist options",
+ persist_options_status.ToString());
+ }
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to persist options in SetDBOptions() -- %s",
+ persist_options_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(
+ ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/,
+ int level) {
+ mutex_.AssertHeld();
+ const auto* vstorage = cfd->current()->storage_info();
+ int minimum_level = level;
+ for (int i = level - 1; i > 0; --i) {
+ // stop if level i is not empty
+ if (vstorage->NumLevelFiles(i) > 0) break;
+ // stop if level i is too small (cannot fit the level files)
+ if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
+ break;
+ }
+
+ minimum_level = i;
+ }
+ return minimum_level;
+}
+
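+// With manual_wal_flush enabled, FlushWAL(false) only writes the buffered WAL
+// data to the file system, while FlushWAL(true) additionally syncs the WAL
+// via SyncWAL().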
+Status DBImpl::FlushWAL(bool sync) {
+ if (manual_wal_flush_) {
+ Status s;
+ {
+ // We need to lock log_write_mutex_ since logs_ might change concurrently
+ InstrumentedMutexLock wl(&log_write_mutex_);
+ log::Writer* cur_log_writer = logs_.back().writer;
+ s = cur_log_writer->WriteBuffer();
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ s.ToString().c_str());
+      // In case there is a fs error we should set it globally to prevent
+      // future writes
+      WriteStatusCheck(s);
+      // Whether sync or not, we should abort the rest of the function upon
+      // error
+ return s;
+ }
+ if (!sync) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false");
+ return s;
+ }
+ }
+ if (!sync) {
+ return Status::OK();
+ }
+ // sync = true
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
+ return SyncWAL();
+}
+
+Status DBImpl::SyncWAL() {
+ autovector<log::Writer*, 1> logs_to_sync;
+ bool need_log_dir_sync;
+ uint64_t current_log_number;
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ assert(!logs_.empty());
+
+ // This SyncWAL() call only cares about logs up to this number.
+ current_log_number = logfile_number_;
+
+ while (logs_.front().number <= current_log_number &&
+ logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ // First check that logs are safe to sync in background.
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+ return Status::NotSupported(
+ "SyncWAL() is not supported for this implementation of WAL file",
+ immutable_db_options_.allow_mmap_writes
+ ? "try setting Options::allow_mmap_writes to false"
+ : Slice());
+ }
+ }
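+    // Mark these logs as getting synced (under the mutex) so that concurrent
+    // SyncWAL() calls wait above instead of syncing the same files, and so
+    // that MarkLogsSynced() below knows which entries this call owns.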
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ auto& log = *it;
+ assert(!log.getting_synced);
+ log.getting_synced = true;
+ logs_to_sync.push_back(log.writer);
+ }
+
+ need_log_dir_sync = !log_dir_synced_;
+ }
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ Status status;
+ for (log::Writer* log : logs_to_sync) {
+ status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (status.ok() && need_log_dir_sync) {
+ status = directories_.GetWalDir()->Fsync();
+ }
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ MarkLogsSynced(current_log_number, need_log_dir_sync, status);
+ }
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
+
+ return status;
+}
+
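+// LockWAL() flushes any buffered WAL data and then keeps log_write_mutex_
+// held until UnlockWAL() is called, so the set of live WAL writers (logs_)
+// cannot change in between.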
+Status DBImpl::LockWAL() {
+ log_write_mutex_.Lock();
+ auto cur_log_writer = logs_.back().writer;
+ auto status = cur_log_writer->WriteBuffer();
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ status.ToString().c_str());
+    // In case there is a fs error we should set it globally to prevent
+    // future writes
+ WriteStatusCheck(status);
+ }
+ return status;
+}
+
+Status DBImpl::UnlockWAL() {
+ log_write_mutex_.Unlock();
+ return Status::OK();
+}
+
+void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
+ const Status& status) {
+ mutex_.AssertHeld();
+ if (synced_dir && logfile_number_ == up_to && status.ok()) {
+ log_dir_synced_ = true;
+ }
+ for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+ auto& log = *it;
+ assert(log.getting_synced);
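+    // Only a fully synced log other than the current (last) one is released
+    // here; the newest log stays in logs_ as the active writer.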
+ if (status.ok() && logs_.size() > 1) {
+ logs_to_free_.push_back(log.ReleaseWriter());
+ // To modify logs_ both mutex_ and log_write_mutex_ must be held
+ InstrumentedMutexLock l(&log_write_mutex_);
+ it = logs_.erase(it);
+ } else {
+ log.getting_synced = false;
+ ++it;
+ }
+ }
+ assert(!status.ok() || logs_.empty() || logs_[0].number > up_to ||
+ (logs_.size() == 1 && !logs_[0].getting_synced));
+ log_sync_cv_.SignalAll();
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+ return versions_->LastSequence();
+}
+
+void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
+ versions_->SetLastPublishedSequence(seq);
+}
+
+bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) {
+ if (seqnum > preserve_deletes_seqnum_.load()) {
+ preserve_deletes_seqnum_.store(seqnum);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+InternalIterator* DBImpl::NewInternalIterator(
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+
+ mutex_.Lock();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ mutex_.Unlock();
+ ReadOptions roptions;
+ return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg,
+ sequence);
+}
+
+void DBImpl::SchedulePurge() {
+ mutex_.AssertHeld();
+ assert(opened_successfully_);
+
+ // Purge operations are put into High priority queue
+ bg_purge_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
+void DBImpl::BackgroundCallPurge() {
+ mutex_.Lock();
+
+ while (!logs_to_free_queue_.empty()) {
+ assert(!logs_to_free_queue_.empty());
+ log::Writer* log_writer = *(logs_to_free_queue_.begin());
+ logs_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete log_writer;
+ mutex_.Lock();
+ }
+ while (!superversions_to_free_queue_.empty()) {
+ assert(!superversions_to_free_queue_.empty());
+ SuperVersion* sv = superversions_to_free_queue_.front();
+ superversions_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete sv;
+ mutex_.Lock();
+ }
+
+ // Can't use iterator to go over purge_files_ because inside the loop we're
+ // unlocking the mutex that protects purge_files_.
+ while (!purge_files_.empty()) {
+ auto it = purge_files_.begin();
+    // Need to make a copy of the PurgeFileInfo before unlocking the mutex.
+ PurgeFileInfo purge_file = it->second;
+
+ const std::string& fname = purge_file.fname;
+ const std::string& dir_to_sync = purge_file.dir_to_sync;
+ FileType type = purge_file.type;
+ uint64_t number = purge_file.number;
+ int job_id = purge_file.job_id;
+
+ purge_files_.erase(it);
+
+ mutex_.Unlock();
+ DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
+ mutex_.Lock();
+ }
+
+ bg_purge_scheduled_--;
+
+ bg_cv_.SignalAll();
+  // IMPORTANT: there should be no code after calling SignalAll. This call may
+  // signal the DB destructor that it's OK to proceed with destruction. In
+  // that case, all DB variables will be deallocated and referencing them
+  // will cause trouble.
+ mutex_.Unlock();
+}
+
+namespace {
+struct IterState {
+ IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version,
+ bool _background_purge)
+ : db(_db),
+ mu(_mu),
+ super_version(_super_version),
+ background_purge(_background_purge) {}
+
+ DBImpl* db;
+ InstrumentedMutex* mu;
+ SuperVersion* super_version;
+ bool background_purge;
+};
+
+static void CleanupIteratorState(void* arg1, void* /*arg2*/) {
+ IterState* state = reinterpret_cast<IterState*>(arg1);
+
+ if (state->super_version->Unref()) {
+    // Job id == 0 means that this is not our background process, but rather
+    // a user thread
+ JobContext job_context(0);
+
+ state->mu->Lock();
+ state->super_version->Cleanup();
+ state->db->FindObsoleteFiles(&job_context, false, true);
+ if (state->background_purge) {
+ state->db->ScheduleBgLogWriterClose(&job_context);
+ state->db->AddSuperVersionsToFreeQueue(state->super_version);
+ state->db->SchedulePurge();
+ }
+ state->mu->Unlock();
+
+ if (!state->background_purge) {
+ delete state->super_version;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ if (state->background_purge) {
+        // PurgeObsoleteFiles here does not delete files. Instead, it adds the
+        // files to be deleted to a job queue, and they are deleted in a
+        // separate background thread.
+ state->db->PurgeObsoleteFiles(job_context, true /* schedule only */);
+ state->mu->Lock();
+ state->db->SchedulePurge();
+ state->mu->Unlock();
+ } else {
+ state->db->PurgeObsoleteFiles(job_context);
+ }
+ }
+ job_context.Clean();
+ }
+
+ delete state;
+}
+} // namespace
+
+InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* super_version,
+ Arena* arena,
+ RangeDelAggregator* range_del_agg,
+ SequenceNumber sequence) {
+ InternalIterator* internal_iter;
+ assert(arena != nullptr);
+ assert(range_del_agg != nullptr);
+ // Need to create internal iterator from the arena.
+ MergeIteratorBuilder merge_iter_builder(
+ &cfd->internal_comparator(), arena,
+ !read_options.total_order_seek &&
+ super_version->mutable_cf_options.prefix_extractor != nullptr);
+ // Collect iterator for mutable mem
+ merge_iter_builder.AddIterator(
+ super_version->mem->NewIterator(read_options, arena));
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter;
+ Status s;
+ if (!read_options.ignore_range_deletions) {
+ range_del_iter.reset(
+ super_version->mem->NewRangeTombstoneIterator(read_options, sequence));
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+ // Collect all needed child iterators for immutable memtables
+ if (s.ok()) {
+ super_version->imm->AddIterators(read_options, &merge_iter_builder);
+ if (!read_options.ignore_range_deletions) {
+ s = super_version->imm->AddRangeTombstoneIterators(read_options, arena,
+ range_del_agg);
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
+ if (s.ok()) {
+ // Collect iterators for files in L0 - Ln
+ if (read_options.read_tier != kMemtableTier) {
+ super_version->current->AddIterators(read_options, file_options_,
+ &merge_iter_builder, range_del_agg);
+ }
+ internal_iter = merge_iter_builder.Finish();
+ IterState* cleanup =
+ new IterState(this, &mutex_, super_version,
+ read_options.background_purge_on_iterator_cleanup ||
+ immutable_db_options_.avoid_unnecessary_blocking_io);
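+    // The cleanup callback releases the SuperVersion reference taken by the
+    // caller (and schedules obsolete file purging if needed) when the
+    // iterator is destroyed; see CleanupIteratorState above.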
+ internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+ return internal_iter;
+ } else {
+ CleanupSuperVersion(super_version);
+ }
+ return NewErrorInternalIterator<Slice>(s, arena);
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+ return default_cf_handle_;
+}
+
+ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
+ return persist_stats_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = value;
+ return GetImpl(read_options, key, get_impl_options);
+}
+
+Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
+ GetImplOptions get_impl_options) {
+ assert(get_impl_options.value != nullptr ||
+ get_impl_options.merge_operands != nullptr);
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ auto cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(get_impl_options.column_family);
+ auto cfd = cfh->cfd();
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(get_impl_options.column_family, key);
+ }
+ }
+
+ // Acquire SuperVersion
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ TEST_SYNC_POINT("DBImpl::GetImpl:1");
+ TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+ SequenceNumber snapshot;
+ if (read_options.snapshot != nullptr) {
+ if (get_impl_options.callback) {
+ // Already calculated based on read_options.snapshot
+ snapshot = get_impl_options.callback->max_visible_seq();
+ } else {
+ snapshot =
+ reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ }
+ } else {
+ // Note that the snapshot is assigned AFTER referencing the super
+ // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was
+ // visible to the snapshot before compaction nor the newer data inserted
+ // afterwards.
+ snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ if (get_impl_options.callback) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ get_impl_options.callback->Refresh(snapshot);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ snapshot = get_impl_options.callback->max_visible_seq();
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::GetImpl:3");
+ TEST_SYNC_POINT("DBImpl::GetImpl:4");
+
+ // Prepare to store a list of merge operations if merge occurs.
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ Status s;
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ if (!skip_memtable) {
+ // Get value associated with key
+ if (get_impl_options.get_value) {
+ if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s,
+ &merge_context, &max_covering_tombstone_seq,
+ read_options, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+ get_impl_options.value->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s,
+ &merge_context, &max_covering_tombstone_seq,
+ read_options, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+ get_impl_options.value->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ } else {
+      // Get the merge operands associated with the key; they should not be
+      // merged, and the raw values should be returned to the user.
+ if (sv->mem->Get(lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options, nullptr,
+ nullptr, false)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->GetMergeOperands(lkey, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return s;
+ }
+ }
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ sv->current->Get(
+ read_options, lkey, get_impl_options.value, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ get_impl_options.get_value ? get_impl_options.value_found : nullptr,
+ nullptr, nullptr,
+ get_impl_options.get_value ? get_impl_options.callback : nullptr,
+ get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr,
+ get_impl_options.get_value);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = 0;
+ if (s.ok()) {
+ if (get_impl_options.get_value) {
+ size = get_impl_options.value->size();
+ } else {
+ // Return all merge operands for get_impl_options.key
+ *get_impl_options.number_of_operands =
+ static_cast<int>(merge_context.GetNumOperands());
+ if (*get_impl_options.number_of_operands >
+ get_impl_options.get_merge_operands_options
+ ->expected_max_number_of_operands) {
+ s = Status::Incomplete(
+ Status::SubCode::KMergeOperandsInsufficientCapacity);
+ } else {
+ for (const Slice& sl : merge_context.GetOperands()) {
+ size += sl.size();
+ get_impl_options.merge_operands->PinSelf(sl);
+ get_impl_options.merge_operands++;
+ }
+ }
+ }
+ RecordTick(stats_, BYTES_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ }
+ return s;
+}
+
+std::vector<Status> DBImpl::MultiGet(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_MULTIGET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ SequenceNumber consistent_seqnum;
+
+ std::unordered_map<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+ column_family.size());
+ for (auto cf : column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cf);
+ auto cfd = cfh->cfd();
+ if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+ multiget_cf_data.emplace(cfd->GetID(),
+ MultiGetColumnFamilyData(cfh, nullptr));
+ }
+ }
+
+ std::function<MultiGetColumnFamilyData*(
+ std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+ iter_deref_lambda =
+ [](std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&
+ cf_iter) { return &cf_iter->second; };
+
+ bool unref_only =
+ MultiCFSnapshot<std::unordered_map<uint32_t, MultiGetColumnFamilyData>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+  // Contains a list of merge operations if a merge occurs.
+ MergeContext merge_context;
+
+ // Note: this always resizes the values array
+ size_t num_keys = keys.size();
+ std::vector<Status> stat_list(num_keys);
+ values->resize(num_keys);
+
+ // Keep track of bytes that we read for statistics-recording later
+ uint64_t bytes_read = 0;
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t num_found = 0;
+ for (size_t i = 0; i < num_keys; ++i) {
+ merge_context.Clear();
+ Status& s = stat_list[i];
+ std::string* value = &(*values)[i];
+
+ LookupKey lkey(keys[i], consistent_seqnum);
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+ assert(mgd_iter != multiget_cf_data.end());
+ auto mgd = mgd_iter->second;
+ auto super_version = mgd.super_version;
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ if (!skip_memtable) {
+ if (super_version->mem->Get(lkey, value, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if (super_version->imm->Get(lkey, value, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done) {
+ PinnableSlice pinnable_val;
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, &pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ if (s.ok()) {
+ bytes_read += value->size();
+ num_found++;
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ autovector<SuperVersion*> superversions_to_delete;
+
+ for (auto mgd_iter : multiget_cf_data) {
+ auto mgd = mgd_iter.second;
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
+ } else {
+ mgd.cfd->GetSuperVersion()->Unref();
+ }
+ }
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+
+ return stat_list;
+}
+
+template <class T>
+bool DBImpl::MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot) {
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ bool last_try = false;
+ if (cf_list->size() == 1) {
+    // Fast path for a single column family. We can simply get the thread
+    // local super version
+ auto cf_iter = cf_list->begin();
+ auto node = iter_deref_func(cf_iter);
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ if (read_options.snapshot != nullptr) {
+ // Note: In WritePrepared txns this is not necessary but not harmful
+ // either. Because prep_seq > snapshot => commit_seq > snapshot so if
+ // a snapshot is specified we should be fine with skipping seq numbers
+ // that are greater than that.
+ //
+ // In WriteUnprepared, we cannot set snapshot in the lookup key because we
+ // may skip uncommitted data that should be visible to the transaction for
+ // reading own writes.
+ *snapshot =
+ static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ if (callback) {
+ *snapshot = std::max(*snapshot, callback->max_visible_seq());
+ }
+ } else {
+ // Since we get and reference the super version before getting
+ // the snapshot number, without a mutex protection, it is possible
+ // that a memtable switch happened in the middle and not all the
+ // data for this snapshot is available. But it will contain all
+ // the data available in the super version we have, which is also
+ // a valid snapshot to read from.
+ // We shouldn't get snapshot before finding and referencing the super
+ // version because a flush happening in between may compact away data for
+ // the snapshot, but the snapshot is earlier than the data overwriting it,
+ // so users may see wrong results.
+ *snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ }
+ } else {
+    // If we end up with the same issue of the memtable getting sealed during
+    // 2 consecutive retries, it means the write rate is very high. In that
+    // case it's probably ok to take the mutex on the 3rd try so we can
+    // succeed for sure
+ static const int num_retries = 3;
+ for (int i = 0; i < num_retries; ++i) {
+ last_try = (i == num_retries - 1);
+ bool retry = false;
+
+ if (i > 0) {
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ SuperVersion* super_version = node->super_version;
+ ColumnFamilyData* cfd = node->cfd;
+ if (super_version != nullptr) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ }
+ node->super_version = nullptr;
+ }
+ }
+ if (read_options.snapshot == nullptr) {
+ if (last_try) {
+ TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+ // We're close to max number of retries. For the last retry,
+ // acquire the lock so we're sure to succeed
+ mutex_.Lock();
+ }
+ *snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ } else {
+ *snapshot = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_;
+ }
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ if (!last_try) {
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ } else {
+ node->super_version = node->cfd->GetSuperVersion()->Ref();
+ }
+ TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+ if (read_options.snapshot != nullptr || last_try) {
+ // If user passed a snapshot, then we don't care if a memtable is
+ // sealed or compaction happens because the snapshot would ensure
+ // that older key versions are kept around. If this is the last
+ // retry, then we have the lock so nothing bad can happen
+ continue;
+ }
+ // We could get the earliest sequence number for the whole list of
+ // memtables, which will include immutable memtables as well, but that
+ // might be tricky to maintain in case we decide, in future, to do
+ // memtable compaction.
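+        // If the memtable in the super version we just referenced was
+        // created entirely after the snapshot we picked (its earliest
+        // sequence number is newer), a memtable switch raced with us, so
+        // release all references and retry.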
+ if (!last_try) {
+ SequenceNumber seq =
+ node->super_version->mem->GetEarliestSequenceNumber();
+ if (seq > *snapshot) {
+ retry = true;
+ break;
+ }
+ }
+ }
+ if (!retry) {
+ if (last_try) {
+ mutex_.Unlock();
+ }
+ break;
+ }
+ }
+ }
+
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ return last_try;
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input) {
+ if (num_keys == 0) {
+ return;
+ }
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ key_context.emplace_back(column_families[i], keys[i], &values[i],
+ &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+
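+  // After sorting, keys from the same column family are contiguous. Record
+  // one (start index, key count) range per column family so that each range
+  // can be served from a single super version below.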
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
+ multiget_cf_data;
+ size_t cf_start = 0;
+ ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
+ for (size_t i = 0; i < num_keys; ++i) {
+ KeyContext* key_ctx = sorted_keys[i];
+ if (key_ctx->column_family != cf) {
+ multiget_cf_data.emplace_back(
+ MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr));
+ cf_start = i;
+ cf = key_ctx->column_family;
+ }
+ }
+  multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
+ std::function<MultiGetColumnFamilyData*(
+ autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
+ iter_deref_lambda =
+ [](autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+ for (auto cf_iter = multiget_cf_data.begin();
+ cf_iter != multiget_cf_data.end(); ++cf_iter) {
+ MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys,
+ cf_iter->super_version, consistent_seqnum, nullptr, nullptr);
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version);
+ } else {
+ cf_iter->cfd->GetSuperVersion()->Unref();
+ }
+ }
+}
+
+namespace {
+// Order keys by CF ID, followed by key contents
+struct CompareKeyContext {
+ inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+ uint32_t cfd_id1 = cfh->cfd()->GetID();
+ const Comparator* comparator = cfh->cfd()->user_comparator();
+    cfh = static_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+    uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+ if (cfd_id1 < cfd_id2) {
+ return true;
+ } else if (cfd_id1 > cfd_id2) {
+ return false;
+ }
+
+ // Both keys are from the same column family
+ int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
+ if (cmp < 0) {
+ return true;
+ }
+ return false;
+ }
+};
+
+} // anonymous namespace
+
+void DBImpl::PrepareMultiGetKeys(
+ size_t num_keys, bool sorted_input,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+#ifndef NDEBUG
+ if (sorted_input) {
+ for (size_t index = 0; index < sorted_keys->size(); ++index) {
+ if (index > 0) {
+ KeyContext* lhs = (*sorted_keys)[index - 1];
+ KeyContext* rhs = (*sorted_keys)[index];
+ ColumnFamilyHandleImpl* cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+ uint32_t cfd_id1 = cfh->cfd()->GetID();
+ const Comparator* comparator = cfh->cfd()->user_comparator();
+        cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+        uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+ assert(cfd_id1 <= cfd_id2);
+ if (cfd_id1 < cfd_id2) {
+ continue;
+ }
+
+ // Both keys are from the same column family
+ int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
+ assert(cmp <= 0);
+ }
+ }
+ }
+#endif
+ if (!sorted_input) {
+ CompareKeyContext sort_comparator;
+ std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
+ sort_comparator);
+ }
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const size_t num_keys,
+ const Slice* keys, PinnableSlice* values,
+ Status* statuses, const bool sorted_input) {
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+ MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
+}
+
+void DBImpl::MultiGetWithCallback(
+ const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+ std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
+ multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
+ std::function<MultiGetColumnFamilyData*(
+ std::array<MultiGetColumnFamilyData, 1>::iterator&)>
+ iter_deref_lambda =
+ [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ size_t num_keys = sorted_keys->size();
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
+ read_options, callback, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+#ifndef NDEBUG
+ assert(!unref_only);
+#else
+ // Silence unused variable warning
+ (void)unref_only;
+#endif // NDEBUG
+
+ if (callback && read_options.snapshot == nullptr) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ callback->Refresh(consistent_seqnum);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ consistent_seqnum = callback->max_visible_seq();
+ }
+
+ MultiGetImpl(read_options, 0, num_keys, sorted_keys,
+ multiget_cf_data[0].super_version, consistent_seqnum, nullptr,
+ nullptr);
+ ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
+ multiget_cf_data[0].super_version);
+}
+
+void DBImpl::MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* super_version, SequenceNumber snapshot,
+ ReadCallback* callback, bool* is_blob_index) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_MULTIGET);
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t keys_left = num_keys;
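+  // Process the keys in batches of at most MultiGetContext::MAX_BATCH_SIZE,
+  // looking each batch up in the mutable memtable, the immutable memtables
+  // and then the SST files, in that order.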
+ while (keys_left) {
+ size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+ ? MultiGetContext::MAX_BATCH_SIZE
+ : keys_left;
+ MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
+ batch_size, snapshot);
+ MultiGetRange range = ctx.GetMultiGetRange();
+ bool lookup_current = false;
+
+ keys_left -= batch_size;
+ for (auto mget_iter = range.begin(); mget_iter != range.end();
+ ++mget_iter) {
+ mget_iter->merge_context.Clear();
+ *mget_iter->s = Status::OK();
+ }
+
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ if (!skip_memtable) {
+ super_version->mem->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ if (!range.empty()) {
+ super_version->imm->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ }
+ if (!range.empty()) {
+ lookup_current = true;
+ uint64_t left = range.KeysLeft();
+ RecordTick(stats_, MEMTABLE_MISS, left);
+ }
+ }
+ if (lookup_current) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ size_t num_found = 0;
+ uint64_t bytes_read = 0;
+ for (size_t i = start_key; i < start_key + num_keys; ++i) {
+ KeyContext* key = (*sorted_keys)[i];
+ if (key->s->ok()) {
+ bytes_read += key->value->size();
+ num_found++;
+ }
+ }
+
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+}
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_family_names.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_families.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(column_families[i].options,
+ column_families[i].name, &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) {
+ Status s;
+ Status persist_options_status;
+ *handle = nullptr;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
+ if (s.ok()) {
+ for (auto& cf_path : cf_options.cf_paths) {
+ s = env_->CreateDirIfMissing(cf_path.path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+ nullptr) {
+ return Status::InvalidArgument("Column family already exists");
+ }
+ VersionEdit edit;
+ edit.AddColumnFamily(column_family_name);
+ uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+ edit.SetColumnFamily(new_id);
+ edit.SetLogNumber(logfile_number_);
+ edit.SetComparatorName(cf_options.comparator->Name());
+
+ // LogAndApply will both write the creation in MANIFEST and create
+ // ColumnFamilyData object
+ { // write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
+ &mutex_, directories_.GetDbDir(), false,
+ &cf_options);
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ std::map<std::string, std::shared_ptr<Directory>> dummy_created_dirs;
+ s = cfd->AddDirectories(&dummy_created_dirs);
+ }
+ if (s.ok()) {
+ single_column_family_mode_ = false;
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context,
+ *cfd->GetLatestMutableCFOptions());
+
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ is_snapshot_supported_ = false;
+ }
+
+ cfd->set_initialized();
+
+ *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Created column family [%s] (ID %u)",
+ column_family_name.c_str(), (unsigned)cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Creating column family [%s] FAILED -- %s",
+ column_family_name.c_str(), s.ToString().c_str());
+ }
+ } // InstrumentedMutexLock l(&mutex_)
+
+ sv_context.Clean();
+ // this is outside the mutex
+ if (s.ok()) {
+ NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+ assert(column_family != nullptr);
+ Status s = DropColumnFamilyImpl(column_family);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ bool success_once = false;
+ for (auto* handle : column_families) {
+ s = DropColumnFamilyImpl(handle);
+ if (!s.ok()) {
+ break;
+ }
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ if (cfd->GetID() == 0) {
+ return Status::InvalidArgument("Can't drop default column family");
+ }
+
+ bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
+
+ VersionEdit edit;
+ edit.DropColumnFamily();
+ edit.SetColumnFamily(cfd->GetID());
+
+ Status s;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd->IsDropped()) {
+ s = Status::InvalidArgument("Column family already dropped!\n");
+ }
+ if (s.ok()) {
+ // we drop column family from a single write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
+ &mutex_);
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ if (!cf_support_snapshot) {
+    // The dropped column family didn't support snapshots, so recalculate
+    // is_snapshot_supported_.
+ bool new_is_snapshot_supported = true;
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+ new_is_snapshot_supported = false;
+ break;
+ }
+ }
+ is_snapshot_supported_ = new_is_snapshot_supported;
+ }
+ bg_cv_.SignalAll();
+ }
+
+ if (s.ok()) {
+ // Note that here we erase the associated cf_info of the to-be-dropped
+ // cfd before its ref-count goes to zero to avoid having to erase cf_info
+ // later inside db_mutex.
+ EraseThreadStatusCfInfo(cfd);
+ assert(cfd->IsDropped());
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Dropped column family with id %u\n", cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Dropping column family with id %u FAILED -- %s\n",
+ cfd->GetID(), s.ToString().c_str());
+ }
+
+ return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, bool* value_found) {
+ assert(value != nullptr);
+ if (value_found != nullptr) {
+ // falsify later if key-may-exist but can't fetch value
+ *value_found = true;
+ }
+ ReadOptions roptions = read_options;
+ roptions.read_tier = kBlockCacheTier; // read from block cache only
+ PinnableSlice pinnable_val;
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = value_found;
+ auto s = GetImpl(roptions, key, get_impl_options);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table is not
+  // present in block_cache, the return value will be Status::Incomplete.
+  // In this case, the key may still exist in the table.
+ return s.ok() || s.IsIncomplete();
+}
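+
+// Illustrative sketch, not part of the upstream source: KeyMayExist() may
+// return true for keys that are absent (a false positive), but it never
+// returns false for keys that exist. "db" and "key" are placeholders.
+//
+//   std::string value;
+//   bool value_found = false;
+//   if (db->KeyMayExist(rocksdb::ReadOptions(), db->DefaultColumnFamily(),
+//                       "key", &value, &value_found)) {
+//     // The key may exist; "value" is only meaningful if value_found is true.
+//   }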
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ Iterator* result = nullptr;
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+ // if iterator wants internal keys, we can only proceed if
+ // we can guarantee the deletes haven't been processed yet
+ if (immutable_db_options_.preserve_deletes &&
+ read_options.iter_start_seqnum > 0 &&
+ read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) {
+ return NewErrorIterator(Status::InvalidArgument(
+ "Iterator requested internal keys which are too old and are not"
+ " guaranteed to be preserved, try larger iter_start_seqnum opt."));
+ }
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ // not supported in lite version
+ result = nullptr;
+
+#else
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv);
+ result = NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback,
+ this, cfd);
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
+ // WritePreparedTxnDB
+ auto snapshot = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : versions_->LastSequence();
+ result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+ }
+ return result;
+}
+
+ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool allow_blob,
+ bool allow_refresh) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+
+ // Try to generate a DB iterator tree in continuous memory area to be
+ // cache friendly. Here is an example of result:
+ // +-------------------------------+
+ // | |
+ // | ArenaWrappedDBIter |
+ // | + |
+ // | +---> Inner Iterator ------------+
+ // | | | |
+ // | | +-- -- -- -- -- -- -- --+ |
+ // | +--- | Arena | |
+ // | | | |
+ // | Allocated Memory: | |
+ // | | +-------------------+ |
+ // | | | DBIter | <---+
+ // | | + |
+ // | | | +-> iter_ ------------+
+ // | | | | |
+ // | | +-------------------+ |
+ // | | | MergingIterator | <---+
+ // | | + |
+ // | | | +->child iter1 ------------+
+ // | | | | | |
+ // | | +->child iter2 ----------+ |
+ // | | | | | | |
+ // | | | +->child iter3 --------+ | |
+ // | | | | | |
+ // | | +-------------------+ | | |
+ // | | | Iterator1 | <--------+
+ // | | +-------------------+ | |
+ // | | | Iterator2 | <------+
+ // | | +-------------------+ |
+ // | | | Iterator3 | <----+
+ // | | +-------------------+
+ // | | |
+ // +-------+-----------------------+
+ //
+ // ArenaWrappedDBIter inlines an arena area where all the iterators in
+ // the iterator tree are allocated in the order of being accessed when
+ // querying.
+ // Laying out the iterators in the order of being accessed makes it more
+ // likely that any iterator pointer is close to the iterator it points to so
+ // that they are likely to be in the same cache line and/or page.
+ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback, this, cfd, allow_blob,
+ read_options.snapshot != nullptr ? false : allow_refresh);
+
+ InternalIterator* internal_iter =
+ NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), snapshot);
+ db_iter->SetIterUnderDBIter(internal_iter);
+
+ return db_iter;
+}
+
+Status DBImpl::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ return Status::InvalidArgument(
+ "Tailing iterator not supported in RocksDB lite");
+#else
+ for (auto cfh : column_families) {
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv);
+ iterators->push_back(NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ read_callback, this, cfd));
+ }
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterators is overridden in
+ // WritePreparedTxnDB
+ auto snapshot = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : versions_->LastSequence();
+ for (size_t i = 0; i < column_families.size(); ++i) {
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(column_families[i])->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, snapshot, read_callback));
+ }
+ }
+
+ return Status::OK();
+}
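+
+// Illustrative sketch, not part of the upstream source: basic usage of an
+// iterator returned by NewIterator(). "db" is a placeholder for an open DB.
+//
+//   std::unique_ptr<rocksdb::Iterator> it(
+//       db->NewIterator(rocksdb::ReadOptions()));
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     // it->key() and it->value() are valid until the iterator is advanced.
+//   }
+//   assert(it->status().ok());  // check for errors after the scan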
+
+const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
+
+#ifndef ROCKSDB_LITE
+const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
+ return GetSnapshotImpl(true);
+}
+#endif // ROCKSDB_LITE
+
+SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock) {
+ int64_t unix_time = 0;
+ env_->GetCurrentTime(&unix_time); // Ignore error
+ SnapshotImpl* s = new SnapshotImpl;
+
+ if (lock) {
+ mutex_.Lock();
+ }
+  // Return nullptr if the underlying memtable does not support snapshots.
+ if (!is_snapshot_supported_) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return nullptr;
+ }
+ auto snapshot_seq = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ SnapshotImpl* snapshot =
+ snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
+ if (lock) {
+ mutex_.Unlock();
+ }
+ return snapshot;
+}
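+
+// Illustrative sketch, not part of the upstream source: pinning a consistent
+// read view with a snapshot and releasing it afterwards. "db" and "key" are
+// placeholders.
+//
+//   const rocksdb::Snapshot* snap = db->GetSnapshot();  // may return nullptr
+//   if (snap != nullptr) {
+//     rocksdb::ReadOptions ro;
+//     ro.snapshot = snap;
+//     std::string value;
+//     rocksdb::Status s = db->Get(ro, "key", &value);
+//     db->ReleaseSnapshot(snap);
+//   }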
+
+namespace {
+typedef autovector<ColumnFamilyData*, 2> CfdList;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+ for (const ColumnFamilyData* t : list) {
+ if (t == cfd) {
+ return true;
+ }
+ }
+ return false;
+}
+} // namespace
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+ const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ snapshots_.Delete(casted_s);
+ uint64_t oldest_snapshot;
+ if (snapshots_.empty()) {
+ oldest_snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ } else {
+ oldest_snapshot = snapshots_.oldest()->number_;
+ }
+    // Avoid going through every column family by checking a global threshold
+ // first.
+ if (oldest_snapshot > bottommost_files_mark_threshold_) {
+ CfdList cf_scheduled;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+ if (!cfd->current()
+ ->storage_info()
+ ->BottommostFilesMarkedForCompaction()
+ .empty()) {
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ cf_scheduled.push_back(cfd);
+ }
+ }
+
+      // Calculate a new threshold, skipping those CFs where compactions are
+      // scheduled. We do not fold this into the previous loop because the
+      // mutex might be unlocked during that loop, making the result
+      // inaccurate.
+ SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (CfdListContains(cf_scheduled, cfd)) {
+ continue;
+ }
+ new_bottommost_files_mark_threshold = std::min(
+ new_bottommost_files_mark_threshold,
+ cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+ bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
+ }
+ }
+ delete casted_s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfAllTables(props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
+ const Range* range, std::size_t n,
+ TablePropertiesCollection* props) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfTablesInRange(range, n, props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+#endif // ROCKSDB_LITE
+
+const std::string& DBImpl::GetName() const { return dbname_; }
+
+Env* DBImpl::GetEnv() const { return env_; }
+
+FileSystem* DB::GetFileSystem() const {
+ static LegacyFileSystemWrapper fs_wrap(GetEnv());
+ return &fs_wrap;
+}
+
+FileSystem* DBImpl::GetFileSystem() const {
+ return immutable_db_options_.fs.get();
+}
+
+Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
+ InstrumentedMutexLock l(&mutex_);
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfh->cfd()->GetLatestCFOptions());
+}
+
+DBOptions DBImpl::GetDBOptions() const {
+ InstrumentedMutexLock l(&mutex_);
+ return BuildDBOptions(immutable_db_options_, mutable_db_options_);
+}
+
+bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_int) {
+ uint64_t int_value;
+ bool ret_value =
+ GetIntPropertyInternal(cfd, *property_info, false, &int_value);
+ if (ret_value) {
+ *value = ToString(int_value);
+ }
+ return ret_value;
+ } else if (property_info->handle_string) {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetStringProperty(*property_info, property,
+ value);
+ } else if (property_info->handle_string_dbimpl) {
+ std::string tmp_value;
+ bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value);
+ if (ret_value) {
+ *value = tmp_value;
+ }
+ return ret_value;
+ }
+ // Shouldn't reach here since exactly one of handle_string and handle_int
+ // should be non-nullptr.
+ assert(false);
+ return false;
+}
+
+bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family,
+ const Slice& property,
+ std::map<std::string, std::string>* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_map) {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetMapProperty(*property_info, property,
+ value);
+ }
+ // If we reach this point it means that handle_map is not provided for the
+ // requested property
+ return false;
+}
+
+bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ return GetIntPropertyInternal(cfd, *property_info, false, value);
+}
+
+bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value) {
+ assert(property_info.handle_int != nullptr);
+ if (!property_info.need_out_of_mutex) {
+ if (is_locked) {
+ mutex_.AssertHeld();
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ }
+ } else {
+ SuperVersion* sv = nullptr;
+ if (!is_locked) {
+ sv = GetAndRefSuperVersion(cfd);
+ } else {
+ sv = cfd->GetSuperVersion();
+ }
+
+ bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
+ property_info, sv->current, value);
+
+ if (!is_locked) {
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ return ret;
+ }
+}
+
+bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) {
+ assert(value != nullptr);
+ Statistics* statistics = immutable_db_options_.statistics.get();
+ if (!statistics) {
+ return false;
+ }
+ *value = statistics->ToString();
+ return true;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::ResetStats() {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->Clear();
+ }
+ }
+ return Status::OK();
+}
+#endif // ROCKSDB_LITE
+
+bool DBImpl::GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+
+ uint64_t sum = 0;
+ {
+ // Needs mutex to protect the list of column families.
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t value;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ if (GetIntPropertyInternal(cfd, *property_info, true, &value)) {
+ sum += value;
+ } else {
+ return false;
+ }
+ }
+ }
+ *aggregated_value = sum;
+ return true;
+}
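+
+// Illustrative sketch, not part of the upstream source: querying properties
+// through the public API. "rocksdb.estimate-num-keys" and "rocksdb.stats"
+// are standard property names; "db" is a placeholder.
+//
+//   uint64_t num_keys = 0;
+//   if (db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys)) {
+//     // num_keys sums the estimate over all column families.
+//   }
+//   std::string stats;
+//   if (db->GetProperty("rocksdb.stats", &stats)) {
+//     // stats holds a human-readable statistics dump.
+//   }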
+
+SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
+ // TODO(ljin): consider using GetReferencedSuperVersion() directly
+ return cfd->GetThreadLocalSuperVersion(this);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+ if (!cfd) {
+ return nullptr;
+ }
+
+ return GetAndRefSuperVersion(cfd);
+}
+
+void DBImpl::CleanupSuperVersion(SuperVersion* sv) {
+ // Release SuperVersion
+ if (sv->Unref()) {
+ bool defer_purge =
+ immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ SchedulePurge();
+ }
+ }
+ if (!defer_purge) {
+ delete sv;
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
+}
+
+void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+ SuperVersion* sv) {
+ if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
+ CleanupSuperVersion(sv);
+ }
+}
+
+// REQUIRED: this function should only be called on the write thread.
+void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+ SuperVersion* sv) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+
+ // If SuperVersion is held, and we successfully fetched a cfd using
+ // GetAndRefSuperVersion(), it must still exist.
+ assert(cfd != nullptr);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
+ ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+ if (!cf_memtables->Seek(column_family_id)) {
+ return nullptr;
+ }
+
+ return cf_memtables->GetColumnFamilyHandle();
+}
+
+// REQUIRED: mutex is NOT held.
+std::unique_ptr<ColumnFamilyHandle> DBImpl::GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
+ if (cfd == nullptr) {
+ return nullptr;
+ }
+
+ return std::unique_ptr<ColumnFamilyHandleImpl>(
+ new ColumnFamilyHandleImpl(cfd, this, &mutex_));
+}
+
+void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) {
+ ColumnFamilyHandleImpl* cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
+ MemTable::MemTableStats memStats =
+ sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
+ MemTable::MemTableStats immStats =
+ sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
+ *count = memStats.count + immStats.count;
+ *size = memStats.size + immStats.size;
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n, uint64_t* sizes) {
+ if (!options.include_memtabtles && !options.include_files) {
+ return Status::InvalidArgument("Invalid options");
+ }
+
+ Version* v;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ v = sv->current;
+
+ for (int i = 0; i < n; i++) {
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+ sizes[i] = 0;
+ if (options.include_files) {
+ sizes[i] += versions_->ApproximateSize(
+ options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
+ /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
+ }
+ if (options.include_memtabtles) {
+ sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ }
+ }
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return Status::OK();
+}
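+
+// Illustrative sketch, not part of the upstream source: estimating the size
+// of a key range with the options-based overload above. "db" and the key
+// bounds are placeholders; the field spelling include_memtabtles matches the
+// SizeApproximationOptions struct referenced above.
+//
+//   rocksdb::SizeApproximationOptions sao;
+//   sao.include_files = true;
+//   sao.include_memtabtles = true;
+//   rocksdb::Range r("a", "z");
+//   uint64_t size = 0;
+//   rocksdb::Status s = db->GetApproximateSizes(
+//       sao, db->DefaultColumnFamily(), &r, 1, &size);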
+
+std::list<uint64_t>::iterator
+DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
+ // We need to remember the iterator of our insert, because after the
+ // background job is done, we need to remove that element from
+ // pending_outputs_.
+ pending_outputs_.push_back(versions_->current_next_file_number());
+ auto pending_outputs_inserted_elem = pending_outputs_.end();
+ --pending_outputs_inserted_elem;
+ return pending_outputs_inserted_elem;
+}
+
+void DBImpl::ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v) {
+ if (v.get() != nullptr) {
+ pending_outputs_.erase(*v.get());
+ v.reset();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options) {
+ RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
+ if (seq > versions_->LastSequence()) {
+ return Status::NotFound("Requested sequence not yet written in the db");
+ }
+ return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
+}
+
+Status DBImpl::DeleteFile(std::string name) {
+ uint64_t number;
+ FileType type;
+ WalFileType log_type;
+ if (!ParseFileName(name, &number, &type, &log_type) ||
+ (type != kTableFile && type != kLogFile)) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
+ name.c_str());
+ return Status::InvalidArgument("Invalid file name");
+ }
+
+ Status status;
+ if (type == kLogFile) {
+ // Only allow deleting archived log files
+ if (log_type != kArchivedLogFile) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed - not archived log.\n",
+ name.c_str());
+ return Status::NotSupported("Delete only supported for archived logs");
+ }
+ status = wal_manager_.DeleteFile(name, number);
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed -- %s.\n", name.c_str(),
+ status.ToString().c_str());
+ }
+ return status;
+ }
+
+ int level;
+ FileMetaData* metadata;
+ ColumnFamilyData* cfd;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed. File not found\n", name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not found");
+ }
+ assert(level < cfd->NumberLevels());
+
+    // If the file is being compacted, there is no need to delete it.
+ if (metadata->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DeleteFile %s Skipped. File about to be compacted\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::OK();
+ }
+
+ // Only the files in the last level can be deleted externally.
+ // This is to make sure that any deletion tombstones are not
+ // lost. Check that the level passed is the last level.
+    auto* vstorage = cfd->current()->storage_info();
+ for (int i = level + 1; i < cfd->NumberLevels(); i++) {
+      if (vstorage->NumLevelFiles(i) != 0) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s FAILED. File not in last level\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not in last level");
+ }
+ }
+ // if level == 0, it has to be the oldest file
+ if (level == 0 &&
+        vstorage->LevelFiles(0).back()->fd.GetNumber() != number) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed ---"
+ " target file in level 0 must be the oldest.",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File in level 0, but not oldest");
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(level, number);
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end) {
+ Status status;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ VersionEdit edit;
+ std::set<FileMetaData*> deleted_files;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ Version* input_version = cfd->current();
+
+ auto* vstorage = input_version->storage_info();
+ for (size_t r = 0; r < n; r++) {
+ auto begin = ranges[r].start, end = ranges[r].limit;
+ for (int i = 1; i < cfd->NumberLevels(); i++) {
+ if (vstorage->LevelFiles(i).empty() ||
+ !vstorage->OverlapInLevel(i, begin, end)) {
+ continue;
+ }
+ std::vector<FileMetaData*> level_files;
+ InternalKey begin_storage, end_storage, *begin_key, *end_key;
+ if (begin == nullptr) {
+ begin_key = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ begin_key = &begin_storage;
+ }
+ if (end == nullptr) {
+ end_key = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ end_key = &end_storage;
+ }
+
+ vstorage->GetCleanInputsWithinInterval(
+ i, begin_key, end_key, &level_files, -1 /* hint_index */,
+ nullptr /* file_index */);
+ FileMetaData* level_file;
+ for (uint32_t j = 0; j < level_files.size(); j++) {
+ level_file = level_files[j];
+ if (level_file->being_compacted) {
+ continue;
+ }
+ if (deleted_files.find(level_file) != deleted_files.end()) {
+ continue;
+ }
+ if (!include_end && end != nullptr &&
+ cfd->user_comparator()->Compare(level_file->largest.user_key(),
+ *end) == 0) {
+ continue;
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(i, level_file->fd.GetNumber());
+ deleted_files.insert(level_file);
+ level_file->being_compacted = true;
+ }
+ }
+ }
+ if (edit.GetDeletedFiles().empty()) {
+ job_context.Clean();
+ return Status::OK();
+ }
+ input_version->Ref();
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ for (auto* deleted_file : deleted_files) {
+ deleted_file->being_compacted = false;
+ }
+ input_version->Unref();
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->GetLiveFilesMetaData(metadata);
+}
+
+void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* cf_meta) {
+ assert(column_family);
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ auto* sv = GetAndRefSuperVersion(cfd);
+ {
+ // Without mutex, Version::GetColumnFamilyMetaData will have data race with
+ // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but
+ // this may cause regression. An alternative is to make
+ // FileMetaData::being_compacted atomic, but it will make FileMetaData
+ // non-copy-able. Another option is to separate these variables from
+ // original FileMetaData struct, and this requires re-organization of data
+ // structures. For now, we take the easy approach. If
+ // DB::GetColumnFamilyMetaData is not called frequently, the regression
+ // should not be big. We still need to keep an eye on it.
+ InstrumentedMutexLock l(&mutex_);
+ sv->current->GetColumnFamilyMetaData(cf_meta);
+ }
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+#endif // ROCKSDB_LITE
+
+Status DBImpl::CheckConsistency() {
+ mutex_.AssertHeld();
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
+
+ std::string corruption_messages;
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ // Instead of calling GetFileSize() for each expected file, call
+ // GetChildren() for the DB directory and check that all expected files
+ // are listed, without checking their sizes.
+ // Since sst files might be in different directories, do it for each
+ // directory separately.
+ std::map<std::string, std::vector<std::string>> files_by_directory;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/". Remove it.
+ std::string fname = md.name;
+ if (!fname.empty() && fname[0] == '/') {
+ fname = fname.substr(1);
+ }
+ files_by_directory[md.db_path].push_back(fname);
+ }
+ for (const auto& dir_files : files_by_directory) {
+ std::string directory = dir_files.first;
+ std::vector<std::string> existing_files;
+ Status s = env_->GetChildren(directory, &existing_files);
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't list files in " + directory + ": " + s.ToString() + "\n";
+ continue;
+ }
+ std::sort(existing_files.begin(), existing_files.end());
+
+ for (const std::string& fname : dir_files.second) {
+ if (!std::binary_search(existing_files.begin(), existing_files.end(),
+ fname) &&
+ !std::binary_search(existing_files.begin(), existing_files.end(),
+ Rocks2LevelTableFileName(fname))) {
+ corruption_messages +=
+ "Missing sst file " + fname + " in " + directory + "\n";
+ }
+ }
+ }
+ } else {
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
+ Status s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ } else if (fsize != md.size) {
+ corruption_messages += "Sst file size mismatch: " + file_path +
+ ". Size recorded in manifest " +
+ ToString(md.size) + ", actual size " +
+ ToString(fsize) + "\n";
+ }
+ }
+ }
+
+ if (corruption_messages.size() == 0) {
+ return Status::OK();
+ } else {
+ return Status::Corruption(corruption_messages);
+ }
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) const {
+ identity.assign(db_id_);
+ return Status::OK();
+}
+
+Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
+ std::string idfilename = IdentityFileName(dbname_);
+ const FileOptions soptions;
+
+ Status s = ReadFileToString(fs_.get(), idfilename, identity);
+ if (!s.ok()) {
+ return s;
+ }
+
+  // If the last character is '\n', remove it from identity.
+ if (identity->size() > 0 && identity->back() == '\n') {
+ identity->pop_back();
+ }
+ return s;
+}
+
+// Default implementation -- returns not supported status
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+ const std::string& /*column_family_name*/,
+ ColumnFamilyHandle** /*handle*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const ColumnFamilyOptions& /*cf_options*/,
+ const std::vector<std::string>& /*column_family_names*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) {
+ delete column_family;
+ return Status::OK();
+}
+
+DB::~DB() {}
+
+Status DBImpl::Close() {
+ if (!closed_) {
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // If there is unreleased snapshot, fail the close call
+ if (!snapshots_.empty()) {
+ return Status::Aborted("Cannot close DB with unreleased snapshot.");
+ }
+ }
+
+ closed_ = true;
+ return CloseImpl();
+ }
+ return Status::OK();
+}
+
+Status DB::ListColumnFamilies(const DBOptions& db_options,
+ const std::string& name,
+ std::vector<std::string>* column_families) {
+ FileSystem* fs = db_options.file_system.get();
+ LegacyFileSystemWrapper legacy_fs(db_options.env);
+ if (!fs) {
+ fs = &legacy_fs;
+ }
+ return VersionSet::ListColumnFamilies(column_families, name, fs);
+}
+
+Snapshot::~Snapshot() {}
+
+Status DestroyDB(const std::string& dbname, const Options& options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+ Env* env = soptions.env;
+ std::vector<std::string> filenames;
+ bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions);
+
+ // Reset the logger because it holds a handle to the
+ // log file and prevents cleanup and directory removal
+ soptions.info_log.reset();
+ // Ignore error in case directory does not exist
+ env->GetChildren(dbname, &filenames);
+
+ FileLock* lock;
+ const std::string lockname = LockFileName(dbname);
+ Status result = env->LockFile(lockname, &lock);
+ if (result.ok()) {
+ uint64_t number;
+ FileType type;
+ InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname);
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) &&
+ type != kDBLockFile) { // Lock file will be deleted at end
+ Status del;
+ std::string path_to_delete = dbname + "/" + fname;
+ if (type == kMetaDatabase) {
+ del = DestroyDB(path_to_delete, options);
+ } else if (type == kTableFile || type == kLogFile) {
+ del = DeleteDBFile(&soptions, path_to_delete, dbname,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+ } else {
+ del = env->DeleteFile(path_to_delete);
+ }
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+
+ std::vector<std::string> paths;
+
+ for (const auto& path : options.db_paths) {
+ paths.emplace_back(path.path);
+ }
+ for (const auto& cf : column_families) {
+ for (const auto& path : cf.options.cf_paths) {
+ paths.emplace_back(path.path);
+ }
+ }
+
+ // Remove duplicate paths.
+ // Note that we compare only the actual paths but not path ids.
+    // The reason is that the same path can appear under different path_ids
+ // for different column families.
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+ for (const auto& path : paths) {
+ if (env->GetChildren(path, &filenames).ok()) {
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, &type) &&
+              type == kTableFile) {  // only SST files are placed in db_paths / cf_paths
+ std::string table_path = path + "/" + fname;
+ Status del = DeleteDBFile(&soptions, table_path, dbname,
+ /*force_bg=*/false, /*force_fg=*/false);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(path);
+ }
+ }
+
+ std::vector<std::string> walDirFiles;
+ std::string archivedir = ArchivalDirectory(dbname);
+ bool wal_dir_exists = false;
+ if (dbname != soptions.wal_dir) {
+ wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok();
+ archivedir = ArchivalDirectory(soptions.wal_dir);
+ }
+
+    // The archive dir may be inside the wal dir or dbname directory, so it
+    // should be processed and removed before those; otherwise we would have
+    // trouble removing them.
+ std::vector<std::string> archiveFiles;
+ if (env->GetChildren(archivedir, &archiveFiles).ok()) {
+ // Delete archival files.
+ for (const auto& file : archiveFiles) {
+ if (ParseFileName(file, &number, &type) && type == kLogFile) {
+ Status del =
+ DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(archivedir);
+ }
+
+ // Delete log files in the WAL dir
+ if (wal_dir_exists) {
+ for (const auto& file : walDirFiles) {
+ if (ParseFileName(file, &number, &type) && type == kLogFile) {
+ Status del =
+ DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
+ soptions.wal_dir, /*force_bg=*/false,
+ /*force_fg=*/!wal_in_db_path);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(soptions.wal_dir);
+ }
+
+ env->UnlockFile(lock); // Ignore error since state is already gone
+ env->DeleteFile(lockname);
+
+ // sst_file_manager holds a ref to the logger. Make sure the logger is
+ // gone before trying to remove the directory.
+ soptions.sst_file_manager.reset();
+
+ env->DeleteDir(dbname); // Ignore error in case dir contains other files
+ }
+ return result;
+}
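+
+// Illustrative sketch, not part of the upstream source: DestroyDB() removes
+// all files that belong to a database, so it must only be called on a DB
+// that is not currently open. The path below is a placeholder.
+//
+//   rocksdb::Options options;
+//   rocksdb::Status s = rocksdb::DestroyDB("/tmp/example_db", options);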
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+ bool need_enter_write_thread) {
+#ifndef ROCKSDB_LITE
+ WriteThread::Writer w;
+ if (need_mutex_lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ }
+
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyOptions> cf_opts;
+
+ // This part requires mutex to protect the column family options
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cf_names.push_back(cfd->GetName());
+ cf_opts.push_back(cfd->GetLatestCFOptions());
+ }
+
+ // Unlock during expensive operations. New writes cannot get here
+ // because the single write thread ensures all new writes get queued.
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ mutex_.Unlock();
+
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+
+ std::string file_name =
+ TempOptionsFileName(GetName(), versions_->NewFileNumber());
+ Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+ GetFileSystem());
+
+ if (s.ok()) {
+ s = RenameTempFileToOptionsFile(file_name);
+ }
+ // restore lock
+ if (!need_mutex_lock) {
+ mutex_.Lock();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Unable to persist options -- %s", s.ToString().c_str());
+ if (immutable_db_options_.fail_if_options_file_error) {
+ return Status::IOError("Unable to persist options.",
+ s.ToString().c_str());
+ }
+ }
+#else
+ (void)need_mutex_lock;
+ (void)need_enter_write_thread;
+#endif // !ROCKSDB_LITE
+ return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+ const size_t num_files_to_keep,
+ const std::shared_ptr<Logger>& info_log,
+ Env* env) {
+ if (filenames.size() <= num_files_to_keep) {
+ return;
+ }
+ for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+ iter != filenames.end(); ++iter) {
+ if (!env->DeleteFile(iter->second).ok()) {
+ ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+ iter->second.c_str());
+ }
+ }
+}
+} // namespace
+#endif // !ROCKSDB_LITE
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+#ifndef ROCKSDB_LITE
+ std::vector<std::string> filenames;
+  // Use an ordered map to keep the filenames sorted from the newest
+ // to the oldest.
+ std::map<uint64_t, std::string> options_filenames;
+ Status s;
+ s = GetEnv()->GetChildren(GetName(), &filenames);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& filename : filenames) {
+ uint64_t file_number;
+ FileType type;
+ if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+ options_filenames.insert(
+ {std::numeric_limits<uint64_t>::max() - file_number,
+ GetName() + "/" + filename});
+ }
+ }
+
+  // Keep the latest 2 options files.
+ const size_t kNumOptionsFilesKept = 2;
+ DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+ immutable_db_options_.info_log, GetEnv());
+ return Status::OK();
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+#ifndef ROCKSDB_LITE
+ Status s;
+
+ uint64_t options_file_number = versions_->NewFileNumber();
+ std::string options_file_name =
+ OptionsFileName(GetName(), options_file_number);
+  // Retry if the file name happens to conflict with an existing one.
+ s = GetEnv()->RenameFile(file_name, options_file_name);
+ if (s.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->options_file_number_ = options_file_number;
+ }
+
+ if (0 == disable_delete_obsolete_files_) {
+ DeleteObsoleteOptionsFiles();
+ }
+ return s;
+#else
+ (void)file_name;
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
+ cfd->ioptions()->env);
+ }
+}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
+ }
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseDatabaseInfo(this);
+ }
+}
+
+#else
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusDbInfo() const {}
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+//
+// A global method that can dump out the build version
+void DumpRocksDBBuildVersion(Logger* log) {
+#if !defined(IOS_CROSS_COMPILE)
+ // if we compile with Xcode, we don't run build_detect_version, so we don't
+ // generate util/build_version.cc
+ ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR, ROCKSDB_PATCH);
+ ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha);
+ ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date);
+#else
+ (void)log; // ignore "-Wunused-parameter"
+#endif
+}
+
+#ifndef ROCKSDB_LITE
+SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history) {
+ // Find the earliest sequence number that we know we can rely on reading
+ // from the memtable without needing to check sst files.
+ SequenceNumber earliest_seq =
+ sv->imm->GetEarliestSequenceNumber(include_history);
+ if (earliest_seq == kMaxSequenceNumber) {
+ earliest_seq = sv->mem->GetEarliestSequenceNumber();
+ }
+ assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
+
+ return earliest_seq;
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+ bool cache_only,
+ SequenceNumber lower_bound_seq,
+ SequenceNumber* seq,
+ bool* found_record_for_key,
+ bool* is_blob_index) {
+ Status s;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ ReadOptions read_options;
+ SequenceNumber current_seq = versions_->LastSequence();
+ LookupKey lkey(key, current_seq);
+
+ *seq = kMaxSequenceNumber;
+ *found_record_for_key = false;
+
+ // Check if there is a record for this key in the latest memtable
+ sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ seq, read_options, nullptr /*read_callback*/, is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTable::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check immutable memtables
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
+ if (lower_bound_in_mem != kMaxSequenceNumber &&
+ lower_bound_in_mem < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+ // Check if there is a record for this key in the immutable memtables
+ sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ seq, read_options, nullptr /*read_callback*/, is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check memtable history
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+ if (lower_bound_in_imm != kMaxSequenceNumber &&
+ lower_bound_in_imm < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+  // Check if there is a record for this key in the memtable history
+ sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, seq, read_options,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(
+ immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check SST files
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+ // check here to skip the history if possible. But currently the caller
+ // already does that. Maybe we should move the logic here later.
+
+ // TODO(agiardullo): possible optimization: consider checking cached
+ // SST files if cache_only=true?
+ if (!cache_only) {
+ // Check tables
+ sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, nullptr /* value_found */,
+ found_record_for_key, seq, nullptr /*read_callback*/,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading SST files
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from Version::Get: %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status DBImpl::IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) {
+ IngestExternalFileArg arg;
+ arg.column_family = column_family;
+ arg.external_files = external_files;
+ arg.options = ingestion_options;
+ return IngestExternalFiles({arg});
+}
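+
+// Illustrative sketch, not part of the upstream source: ingesting an
+// externally built SST file into the default column family. The file path is
+// a placeholder and "db" is an open rocksdb::DB*.
+//
+//   rocksdb::IngestExternalFileOptions ifo;
+//   ifo.move_files = true;  // move the file instead of copying when possible
+//   rocksdb::Status s =
+//       db->IngestExternalFile({"/tmp/example.sst"}, ifo);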
+
+Status DBImpl::IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) {
+ if (args.empty()) {
+ return Status::InvalidArgument("ingestion arg list is empty");
+ }
+ {
+ std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
+ for (const auto& arg : args) {
+ if (arg.column_family == nullptr) {
+ return Status::InvalidArgument("column family handle is null");
+ } else if (unique_cfhs.count(arg.column_family) > 0) {
+ return Status::InvalidArgument(
+ "ingestion args have duplicate column families");
+ }
+ unique_cfhs.insert(arg.column_family);
+ }
+ }
+ // Ingest multiple external SST files atomically.
+ size_t num_cfs = args.size();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (args[i].external_files.empty()) {
+ char err_msg[128] = {0};
+ snprintf(err_msg, 128, "external_files[%zu] is empty", i);
+ return Status::InvalidArgument(err_msg);
+ }
+ }
+ for (const auto& arg : args) {
+ const IngestExternalFileOptions& ingest_opts = arg.options;
+ if (ingest_opts.ingest_behind &&
+ !immutable_db_options_.allow_ingest_behind) {
+ return Status::InvalidArgument(
+ "can't ingest_behind file in DB with allow_ingest_behind=false");
+ }
+ }
+
+ // TODO (yanqin) maybe handle the case in which column_families have
+ // duplicates
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ size_t total = 0;
+ for (const auto& arg : args) {
+ total += arg.external_files.size();
+ }
+ uint64_t next_file_number = 0;
+ Status status = ReserveFileNumbersBeforeIngestion(
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(), total,
+ pending_output_elem, &next_file_number);
+ if (!status.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
+ for (const auto& arg : args) {
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
+ ingestion_jobs.emplace_back(
+ env_, versions_.get(), cfd, immutable_db_options_, file_options_,
+ &snapshots_, arg.options, &directories_, &event_logger_);
+ }
+ std::vector<std::pair<bool, Status>> exec_results;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ exec_results.emplace_back(false, Status::OK());
+ }
+ // TODO(yanqin) maybe make jobs run in parallel
+ uint64_t start_file_number = next_file_number;
+ for (size_t i = 1; i != num_cfs; ++i) {
+ start_file_number += args[i - 1].external_files.size();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ exec_results[i].second = ingestion_jobs[i].Prepare(
+ args[i].external_files, start_file_number, super_version);
+ exec_results[i].first = true;
+ CleanupSuperVersion(super_version);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
+ {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ exec_results[0].second = ingestion_jobs[0].Prepare(
+ args[0].external_files, next_file_number, super_version);
+ exec_results[0].first = true;
+ CleanupSuperVersion(super_version);
+ }
+ for (const auto& exec_result : exec_results) {
+ if (!exec_result.second.ok()) {
+ status = exec_result.second;
+ break;
+ }
+ }
+ if (!status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (exec_results[i].first) {
+ ingestion_jobs[i].Cleanup(status);
+ }
+ }
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<SuperVersionContext> sv_ctxs;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs.emplace_back(true /* create_superversion */);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
+ TEST_SYNC_POINT("DBImpl::AddFile:Start");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ // When unordered_write is enabled, keys are written to the memtable in an
+ // unordered way. If the ingestion job checks the memtable key range before
+ // those keys land in the memtable, it may skip a necessary memtable flush.
+ // So wait here to ensure there are no pending writes to the memtable.
+ WaitForPendingWrites();
+
+ num_running_ingest_file_ += static_cast<int>(num_cfs);
+ TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
+
+ bool at_least_one_cf_need_flush = false;
+ std::vector<bool> need_flush(num_cfs, false);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ // TODO (yanqin) investigate whether we should abort ingestion or
+ // proceed with other non-dropped column families.
+ status = Status::InvalidArgument(
+ "cannot ingest an external file into a dropped CF");
+ break;
+ }
+ bool tmp = false;
+ status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
+ need_flush[i] = tmp;
+ at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
+ &at_least_one_cf_need_flush);
+
+ if (status.ok() && at_least_one_cf_need_flush) {
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds_to_flush;
+ SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
+ mutex_.Unlock();
+ status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* writes_stopped */);
+ mutex_.Lock();
+ } else {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (need_flush[i]) {
+ mutex_.Unlock();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
+ ->cfd();
+ status = FlushMemTable(cfd, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* writes_stopped */);
+ mutex_.Lock();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ // Run ingestion jobs.
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ status = ingestion_jobs[i].Run();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ if (status.ok()) {
+ int consumed_seqno_count =
+ ingestion_jobs[0].ConsumedSequenceNumbersCount();
+#ifndef NDEBUG
+ for (size_t i = 1; i != num_cfs; ++i) {
+ assert(!!consumed_seqno_count ==
+ !!ingestion_jobs[i].ConsumedSequenceNumbersCount());
+ consumed_seqno_count +=
+ ingestion_jobs[i].ConsumedSequenceNumbersCount();
+ }
+#endif
+ if (consumed_seqno_count > 0) {
+ const SequenceNumber last_seqno = versions_->LastSequence();
+ versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+ }
+ autovector<ColumnFamilyData*> cfds_to_commit;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfds_to_commit.push_back(cfd);
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ autovector<VersionEdit*> edit_list;
+ edit_list.push_back(ingestion_jobs[i].edit());
+ edit_lists.push_back(edit_list);
+ ++num_entries;
+ }
+ // Mark the version edits as an atomic group if the number of version
+ // edits exceeds 1.
+ if (cfds_to_commit.size() > 1) {
+ for (auto& edits : edit_lists) {
+ assert(edits.size() == 1);
+ edits[0]->MarkAtomicGroup(--num_entries);
+ }
+ assert(0 == num_entries);
+ }
+ status =
+ versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
+ edit_lists, &mutex_, directories_.GetDbDir());
+ }
+
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
+ *cfd->GetLatestMutableCFOptions());
+#ifndef NDEBUG
+ if (0 == i && num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
+ }
+#endif // !NDEBUG
+ }
+ }
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ if (status.ok()) {
+ for (auto& job : ingestion_jobs) {
+ job.UpdateStats();
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ num_running_ingest_file_ -= static_cast<int>(num_cfs);
+ if (0 == num_running_ingest_file_) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
+ }
+ // mutex_ is unlocked here
+
+ // Cleanup
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs[i].Clean();
+ // This may roll back jobs that have completed successfully. This is
+ // intended to preserve atomicity.
+ ingestion_jobs[i].Cleanup(status);
+ }
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
+ }
+ }
+ }
+ return status;
+}
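+
+// Illustrative usage sketch (not part of the upstream source): atomic
+// ingestion into two column families, assuming an open DB* `db`, handles
+// `cf1`/`cf2`, and pre-built SST files at hypothetical paths.
+//
+//   rocksdb::IngestExternalFileArg arg1, arg2;
+//   arg1.column_family = cf1;
+//   arg1.external_files = {"/tmp/cf1_a.sst"};
+//   arg2.column_family = cf2;
+//   arg2.external_files = {"/tmp/cf2_a.sst"};
+//   rocksdb::Status s = db->IngestExternalFiles({arg1, arg2});
+//   // On success, both column families see the new files; on failure,
+//   // neither does.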
+
+Status DBImpl::CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ assert(*handle == nullptr);
+ std::string cf_comparator_name = options.comparator->Name();
+ if (cf_comparator_name != metadata.db_comparator_name) {
+ return Status::InvalidArgument("Comparator name mismatch");
+ }
+
+ // Create column family.
+ auto status = CreateColumnFamily(options, column_family_name, handle);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Import sst files from metadata.
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(*handle);
+ auto cfd = cfh->cfd();
+ ImportColumnFamilyJob import_job(env_, versions_.get(), cfd,
+ immutable_db_options_, file_options_,
+ import_options, metadata.files);
+
+ SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+ VersionEdit dummy_edit;
+ uint64_t next_file_number = 0;
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Don't import files when there is a bg_error
+ status = error_handler_.GetBGError();
+ }
+
+ // Make sure that bg cleanup won't delete the files that we are importing
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ if (status.ok()) {
+ // If a crash happens after a hard link is established, the Recover
+ // function may reuse a file number that has already been assigned to the
+ // internal file, and this would overwrite the external file. To protect
+ // the external file, we have to make sure the file number is never reused.
+ next_file_number = versions_->FetchAddFileNumber(metadata.files.size());
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ }
+ }
+ dummy_sv_ctx.Clean();
+
+ if (status.ok()) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ status = import_job.Prepare(next_file_number, sv);
+ CleanupSuperVersion(sv);
+ }
+
+ if (status.ok()) {
+ SuperVersionContext sv_context(true /*create_superversion*/);
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ num_running_ingest_file_++;
+ assert(!cfd->IsDropped());
+ status = import_job.Run();
+
+ // Install job edit [Mutex will be unlocked here]
+ if (status.ok()) {
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+ &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+ }
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ num_running_ingest_file_--;
+ if (num_running_ingest_file_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ }
+ // mutex_ is unlocked here
+
+ sv_context.Clean();
+ }
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ }
+
+ import_job.Cleanup(status);
+ if (!status.ok()) {
+ DropColumnFamily(*handle);
+ DestroyColumnFamilyHandle(*handle);
+ *handle = nullptr;
+ }
+ return status;
+}
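+
+// Illustrative usage sketch (not part of the upstream source): the metadata
+// consumed above is typically produced by Checkpoint::ExportColumnFamily()
+// on a source DB. Paths and variable names are hypothetical.
+//
+//   rocksdb::Checkpoint* checkpoint = nullptr;
+//   rocksdb::Status s = rocksdb::Checkpoint::Create(src_db, &checkpoint);
+//   rocksdb::ExportImportFilesMetaData* metadata = nullptr;
+//   if (s.ok()) {
+//     s = checkpoint->ExportColumnFamily(src_cf, "/tmp/cf_export", &metadata);
+//   }
+//   // The ColumnFamilyOptions passed here must use the same comparator as
+//   // the exported column family, as checked above.
+//   rocksdb::ImportColumnFamilyOptions import_opts;  // move_files defaults to false
+//   rocksdb::ColumnFamilyHandle* new_cf = nullptr;
+//   if (s.ok()) {
+//     s = dst_db->CreateColumnFamilyWithImport(rocksdb::ColumnFamilyOptions(),
+//                                              "imported_cf", import_opts,
+//                                              *metadata, &new_cf);
+//   }
+//   delete metadata;
+//   delete checkpoint;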
+
+Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
+ Status s;
+ std::vector<ColumnFamilyData*> cfd_list;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized()) {
+ cfd->Ref();
+ cfd_list.push_back(cfd);
+ }
+ }
+ }
+ std::vector<SuperVersion*> sv_list;
+ for (auto cfd : cfd_list) {
+ sv_list.push_back(cfd->GetReferencedSuperVersion(this));
+ }
+ for (auto& sv : sv_list) {
+ VersionStorageInfo* vstorage = sv->current->storage_info();
+ ColumnFamilyData* cfd = sv->current->cfd();
+ Options opts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfd->GetLatestCFOptions());
+ }
+ for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
+ for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
+ j++) {
+ const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd;
+ std::string fname = TableFileName(cfd->ioptions()->cf_paths,
+ fd.GetNumber(), fd.GetPathId());
+ s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_,
+ read_options, fname);
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto sv : sv_list) {
+ if (sv && sv->Unref()) {
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ } else {
+ delete sv;
+ }
+ }
+ }
+ if (defer_purge) {
+ SchedulePurge();
+ }
+ for (auto cfd : cfd_list) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ return s;
+}
+
+void DBImpl::NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+
+ for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
+ ExternalFileIngestionInfo info;
+ info.cf_name = cfd->GetName();
+ info.external_file_path = f.external_file_path;
+ info.internal_file_path = f.internal_file_path;
+ info.global_seqno = f.assigned_seqno;
+ info.table_properties = f.table_properties;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnExternalFileIngested(this, info);
+ }
+ }
+}
+
+void DBImpl::WaitForIngestFile() {
+ mutex_.AssertHeld();
+ while (num_running_ingest_file_ > 0) {
+ bg_cv_.Wait();
+ }
+}
+
+Status DBImpl::StartTrace(const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer)));
+ return Status::OK();
+}
+
+Status DBImpl::EndTrace() {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ Status s;
+ if (tracer_ != nullptr) {
+ s = tracer_->Close();
+ tracer_.reset();
+ } else {
+ return Status::IOError("No trace file to close");
+ }
+ return s;
+}
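+
+// Illustrative usage sketch (not part of the upstream source): workload
+// tracing is bracketed by StartTrace()/EndTrace(). This assumes the
+// NewFileTraceWriter() helper (declared alongside TraceWriter), an open
+// DB* `db`, and a hypothetical trace path.
+//
+//   std::unique_ptr<rocksdb::TraceWriter> trace_writer;
+//   rocksdb::Status s = rocksdb::NewFileTraceWriter(
+//       db->GetEnv(), rocksdb::EnvOptions(), "/tmp/db.trace", &trace_writer);
+//   if (s.ok()) {
+//     s = db->StartTrace(rocksdb::TraceOptions(), std::move(trace_writer));
+//   }
+//   // ... run the workload to be captured ...
+//   if (s.ok()) s = db->EndTrace();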
+
+Status DBImpl::StartBlockCacheTrace(
+ const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ return block_cache_tracer_.StartTrace(env_, trace_options,
+ std::move(trace_writer));
+}
+
+Status DBImpl::EndBlockCacheTrace() {
+ block_cache_tracer_.EndTrace();
+ return Status::OK();
+}
+
+Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) {
+ Status s;
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeek(cf_id, key);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id,
+ const Slice& key) {
+ Status s;
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeekForPrev(cf_id, key);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number) {
+ Status s;
+ SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
+ assert(nullptr != next_file_number);
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Do not ingest files when there is a bg_error
+ return error_handler_.GetBGError();
+ }
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ *next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ VersionEdit dummy_edit;
+ // If a crash happens after a hard link is established, the Recover
+ // function may reuse a file number that has already been assigned to the
+ // internal file, and this would overwrite the external file. To protect
+ // the external file, we have to make sure the file number is never reused.
+ s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ dummy_sv_ctx.Clean();
+ return s;
+}
+
+Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ if (mutable_db_options_.max_open_files == -1) {
+ uint64_t oldest_time = port::kMaxUint64;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped()) {
+ uint64_t ctime;
+ {
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ Version* version = sv->current;
+ version->GetCreationTimeOfOldestFile(&ctime);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ if (ctime < oldest_time) {
+ oldest_time = ctime;
+ }
+ if (oldest_time == 0) {
+ break;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+ return Status::OK();
+ } else {
+ return Status::NotSupported("This API only works if max_open_files = -1");
+ }
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h
new file mode 100644
index 000000000..119555cb4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.h
@@ -0,0 +1,2107 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_job.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "trace_replay/trace_replay.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than the main one.
+class Directories {
+ public:
+ Status SetDirectories(Env* env, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths);
+
+ Directory* GetDataDir(size_t path_id) const {
+ assert(path_id < data_dirs_.size());
+ Directory* ret_dir = data_dirs_[path_id].get();
+ if (ret_dir == nullptr) {
+ // Should use db_dir_
+ return db_dir_.get();
+ }
+ return ret_dir;
+ }
+
+ Directory* GetWalDir() {
+ if (wal_dir_) {
+ return wal_dir_.get();
+ }
+ return db_dir_.get();
+ }
+
+ Directory* GetDbDir() { return db_dir_.get(); }
+
+ private:
+ std::unique_ptr<Directory> db_dir_;
+ std::vector<std::unique_ptr<Directory>> data_dirs_;
+ std::unique_ptr<Directory> wal_dir_;
+};
+
+// While DB is the public interface of RocksDB, DBImpl is the actual
+// class implementing it. It is the entry point to the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definition of the functions is
+// divided in several db_impl_*.cc files, besides db_impl.cc.
+class DBImpl : public DB {
+ public:
+ DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch = false, const bool batch_per_txn = true);
+ // No copying allowed
+ DBImpl(const DBImpl&) = delete;
+ void operator=(const DBImpl&) = delete;
+
+ virtual ~DBImpl();
+
+ // ---- Implementations of the DB interface ----
+
+ using DB::Resume;
+ virtual Status Resume() override;
+
+ using DB::Put;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ using DB::Merge;
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ using DB::Delete;
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ using DB::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ using DB::Write;
+ virtual Status Write(const WriteOptions& options,
+ WriteBatch* updates) override;
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ using DB::GetMergeOperands;
+ Status GetMergeOperands(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* merge_operands,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) override {
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.merge_operands = merge_operands;
+ get_impl_options.get_merge_operands_options = get_merge_operands_options;
+ get_impl_options.number_of_operands = number_of_operands;
+ get_impl_options.get_value = false;
+ return GetImpl(options, key, get_impl_options);
+ }
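+
+ // Illustrative usage sketch (not part of the upstream source) for
+ // GetMergeOperands() above: read all merge operands for a key without
+ // merging them, assuming an open DB* `db` with a merge operator configured
+ // and a hypothetical bound of 4 operands.
+ //
+ //   const int kMaxOperands = 4;
+ //   rocksdb::PinnableSlice operands[kMaxOperands];
+ //   rocksdb::GetMergeOperandsOptions gmo_options;
+ //   gmo_options.expected_max_number_of_operands = kMaxOperands;
+ //   int num_operands = 0;
+ //   rocksdb::Status s = db->GetMergeOperands(
+ //       rocksdb::ReadOptions(), db->DefaultColumnFamily(), "counter_key",
+ //       operands, &gmo_options, &num_operands);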
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ // This MultiGet is a batched version, which may be faster than calling Get
+ // multiple times, especially if the keys have some spatial locality that
+ // enables them to be queried in the same SST files/set of files. The larger
+ // the batch size, the more scope for batching and performance improvement.
+ // The values and statuses parameters are arrays with a number of elements
+ // equal to keys.size(). This allows the storage for those to be allocated
+ // by the caller on the stack for small batches (see the sketch below).
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
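+
+ // Illustrative usage sketch (not part of the upstream source) for the
+ // batched MultiGet above, with caller-provided stack arrays; keys are
+ // hypothetical and `db` is an open DB*.
+ //
+ //   constexpr size_t kNumKeys = 3;
+ //   rocksdb::Slice keys[kNumKeys] = {"k1", "k2", "k3"};
+ //   rocksdb::PinnableSlice values[kNumKeys];
+ //   rocksdb::Status statuses[kNumKeys];
+ //   db->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(),
+ //                kNumKeys, keys, values, statuses);
+ //   // statuses[i] tells whether values[i] holds a value (OK) or NotFound.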
+
+ virtual void MultiGetWithCallback(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) override;
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+ // Returns false if the key definitely does not exist in the database and
+ // true if it may. If value_found is not passed in as null, then the value
+ // is returned if it is found in memory. On return, *value_found is set to
+ // true if the value was found, and false otherwise.
+ using DB::KeyMayExist;
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value,
+ bool* value_found = nullptr) override;
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ virtual const Snapshot* GetSnapshot() override;
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
+ using DB::GetProperty;
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) override;
+ using DB::GetMapProperty;
+ virtual bool GetMapProperty(
+ ColumnFamilyHandle* column_family, const Slice& property,
+ std::map<std::string, std::string>* value) override;
+ using DB::GetIntProperty;
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) override;
+ using DB::GetAggregatedIntProperty;
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) override;
+ using DB::GetApproximateSizes;
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n,
+ uint64_t* sizes) override;
+ using DB::GetApproximateMemTableStats;
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) override;
+ using DB::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ using DB::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override;
+
+ virtual Status PauseBackgroundWork() override;
+ virtual Status ContinueBackgroundWork() override;
+
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
+
+ virtual void EnableManualCompaction() override;
+ virtual void DisableManualCompaction() override;
+
+ using DB::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ using DB::NumberLevels;
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
+ using DB::MaxMemCompactionLevel;
+ virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
+ using DB::Level0StopWriteTrigger;
+ virtual int Level0StopWriteTrigger(
+ ColumnFamilyHandle* column_family) override;
+ virtual const std::string& GetName() const override;
+ virtual Env* GetEnv() const override;
+ virtual FileSystem* GetFileSystem() const override;
+ using DB::GetOptions;
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const override;
+ using DB::GetDBOptions;
+ virtual DBOptions GetDBOptions() const override;
+ using DB::Flush;
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status Flush(
+ const FlushOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+ virtual Status FlushWAL(bool sync) override;
+ bool TEST_WALBufferIsEmpty(bool lock = true);
+ virtual Status SyncWAL() override;
+ virtual Status LockWAL() override;
+ virtual Status UnlockWAL() override;
+
+ virtual SequenceNumber GetLatestSequenceNumber() const override;
+
+ virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override;
+
+ virtual Status GetDbIdentity(std::string& identity) const override;
+
+ virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override;
+
+ ColumnFamilyHandle* PersistentStatsColumnFamily() const;
+
+ virtual Status Close() override;
+
+ Status GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
+
+#ifndef ROCKSDB_LITE
+ using DB::ResetStats;
+ virtual Status ResetStats() override;
+ virtual Status DisableFileDeletions() override;
+ virtual Status EnableFileDeletions(bool force) override;
+ virtual int IsFileDeletionsEnabled() const;
+ // All the returned filenames start with "/"
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* manifest_file_size,
+ bool flush_memtable = true) override;
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) override;
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* creation_time) override;
+
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options =
+ TransactionLogIterator::ReadOptions()) override;
+ virtual Status DeleteFile(std::string name) override;
+ Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end = true);
+
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* metadata) override;
+
+ // Obtains the meta data of the specified column family of the DB.
+ // Status::NotFound() will be returned if the current DB does not have
+ // any column family matching the specified name.
+ // TODO(yhchiang): output parameter is placed in the end in this codebase.
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* metadata) override;
+
+ Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ Status PromoteL0(ColumnFamilyHandle* column_family,
+ int target_level) override;
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) override;
+
+ using DB::IngestExternalFiles;
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) override;
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) override;
+
+ using DB::VerifyChecksum;
+ virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
+
+ using DB::StartTrace;
+ virtual Status StartTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndTrace;
+ virtual Status EndTrace() override;
+
+ using DB::StartBlockCacheTrace;
+ Status StartBlockCacheTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndBlockCacheTrace;
+ Status EndBlockCacheTrace() override;
+
+ using DB::GetPropertiesOfAllTables;
+ virtual Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) override;
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) override;
+
+#endif // ROCKSDB_LITE
+
+ // ---- End of implementations of the DB interface ----
+
+ struct GetImplOptions {
+ ColumnFamilyHandle* column_family = nullptr;
+ PinnableSlice* value = nullptr;
+ bool* value_found = nullptr;
+ ReadCallback* callback = nullptr;
+ bool* is_blob_index = nullptr;
+ // If true, return the value associated with key via the value pointer;
+ // otherwise return all merge operands for key via the merge_operands
+ // pointer.
+ bool get_value = true;
+ // Pointer to an array of size
+ // get_merge_operands_options.expected_max_number_of_operands allocated by
+ // user
+ PinnableSlice* merge_operands = nullptr;
+ GetMergeOperandsOptions* get_merge_operands_options = nullptr;
+ int* number_of_operands = nullptr;
+ };
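+
+ // For orientation (not part of the upstream comment): a plain Get() fills
+ // this struct roughly as follows before delegating to GetImpl() (sketch,
+ // member names as declared above):
+ //
+ //   GetImplOptions get_impl_options;
+ //   get_impl_options.column_family = column_family;
+ //   get_impl_options.value = pinnable_val;  // get_value stays true
+ //   return GetImpl(read_options, key, get_impl_options);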
+
+ // Function that Get and KeyMayExist call with no_io true or false.
+ // Note: 'value_found' from KeyMayExist propagates here.
+ // This function is also called by GetMergeOperands.
+ // If get_impl_options.get_value = true, the value associated with the key
+ // is returned via get_impl_options.value.
+ // If get_impl_options.get_value = false, the merge operands associated with
+ // the key are returned via get_impl_options.merge_operands.
+ Status GetImpl(const ReadOptions& options, const Slice& key,
+ GetImplOptions get_impl_options);
+
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool allow_blob = false,
+ bool allow_refresh = true);
+
+ virtual SequenceNumber GetLastPublishedSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastPublishedSequence();
+ }
+ }
+
+ // REQUIRES: joined the main write queue if two_write_queues is disabled, and
+ // the second write queue otherwise.
+ virtual void SetLastPublishedSequence(SequenceNumber seq);
+ // Returns LastSequence in last_seq_same_as_publish_seq_
+ // mode and LastAllocatedSequence otherwise. This is useful when visibility
+ // depends also on data written to the WAL but not to the memtable.
+ SequenceNumber TEST_GetLastVisibleSequence() const;
+
+#ifndef ROCKSDB_LITE
+ // Similar to Write() but will call the callback once on the single write
+ // thread to determine whether it is safe to perform the write.
+ virtual Status WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback);
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into the current
+ // memtables. It can then be assumed that any write with a larger (or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ //
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history);
+
+ // For a given key, check to see if there are any records for this key
+ // in the memtables, including memtable history. If cache_only is false,
+ // SST files will also be checked.
+ //
+ // If a key is found, *found_record_for_key will be set to true and
+ // *seq will be set to the stored sequence number for the latest
+ // operation on this key or kMaxSequenceNumber if unknown.
+ // If no key is found, *found_record_for_key will be set to false.
+ //
+ // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+ // the sequence number has been cleared from the record. If the caller is
+ // holding an active db snapshot, we know the missing sequence must be less
+ // than the snapshot's sequence number (sequence numbers are only cleared
+ // when there are no earlier active snapshots).
+ //
+ // If NotFound is returned and found_record_for_key is set to false, then no
+ // record for this key was found. If the caller is holding an active db
+ // snapshot, we know that no such key could have existed after this snapshot
+ // (since we do not compact keys that have an earlier snapshot).
+ //
+ // Only records newer than or at `lower_bound_seq` are guaranteed to be
+ // returned. Memtables and files may not be checked if they only contain
+ // data older than `lower_bound_seq`.
+ //
+ // Returns OK or NotFound on success,
+ // other status on unexpected error.
+ // TODO(andrewkr): this API need to be aware of range deletion operations
+ Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+ bool cache_only,
+ SequenceNumber lower_bound_seq,
+ SequenceNumber* seq,
+ bool* found_record_for_key,
+ bool* is_blob_index = nullptr);
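+
+ // For orientation (not part of the upstream comment): transaction
+ // write-conflict validation is the typical caller. A caller holding a
+ // snapshot with sequence number snap_seq can detect a conflict roughly as
+ // follows (sketch, error handling omitted):
+ //
+ //   SequenceNumber seq = kMaxSequenceNumber;
+ //   bool found_record = false;
+ //   Status s = GetLatestSequenceForKey(sv, key, true /* cache_only */,
+ //                                      snap_seq, &seq, &found_record);
+ //   bool conflict = s.ok() && found_record && seq > snap_seq;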
+
+ Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key);
+ Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key);
+#endif // ROCKSDB_LITE
+
+ // Similar to GetSnapshot(), but also lets the db know that this snapshot
+ // will be used for transaction write-conflict checking. The DB can then
+ // make sure not to compact any keys that would prevent a write-conflict from
+ // being detected.
+ const Snapshot* GetSnapshotForWriteConflictBoundary();
+
+ // checks if all live files exist on file system and that their file sizes
+ // match to our in-memory records
+ virtual Status CheckConsistency();
+
+ // max_file_num_to_ignore allows bottom level compaction to filter out newly
+ // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
+ // disable the filtering
+ Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+ int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const Slice* begin, const Slice* end,
+ bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore);
+
+ // Return an internal iterator over the current state of the database.
+ // The keys of this iterator are internal keys (see format.h).
+ // The returned iterator should be deleted when no longer needed.
+ InternalIterator* NewInternalIterator(
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
+ ColumnFamilyHandle* column_family = nullptr);
+
+ LogsWithPrepTracker* logs_with_prep_tracker() {
+ return &logs_with_prep_tracker_;
+ }
+
+ struct BGJobLimits {
+ int max_flushes;
+ int max_compactions;
+ };
+ // Returns maximum background flushes and compactions allowed to be scheduled
+ BGJobLimits GetBGJobLimits() const;
+ // Need a static version that can be called during SanitizeOptions().
+ static BGJobLimits GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions);
+
+ // move logs pending closing from job_context to the DB queue and
+ // schedule a purge
+ void ScheduleBgLogWriterClose(JobContext* job_context);
+
+ uint64_t MinLogNumberToKeep();
+
+ // Returns the lower bound file number for SSTs that won't be deleted, even if
+ // they're obsolete. This lower bound is used internally to prevent newly
+ // created flush/compaction output files from being deleted before they're
+ // installed. This technique avoids the need for tracking the exact numbers of
+ // files pending creation, although it prevents more files than necessary from
+ // being deleted.
+ uint64_t MinObsoleteSstNumberToKeep();
+
+ // Returns the list of live files in 'live' and the list
+ // of all files in the filesystem in 'candidate_files'.
+ // If force == false and the last call was less than
+ // db_options_.delete_obsolete_files_period_micros microseconds ago,
+ // it will not fill up the job_context
+ void FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan = false);
+
+ // Diffs the files listed in filenames against the live files; those that
+ // do not belong to the live set are possibly removed. Also removes all the
+ // files in sst_delete_files and log_delete_files.
+ // It is not necessary to hold the mutex when invoking this method.
+ // If FindObsoleteFiles() was run, we need to also run
+ // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+ void PurgeObsoleteFiles(JobContext& background_context,
+ bool schedule_only = false);
+
+ // Schedule a background job to actually delete obsolete files.
+ void SchedulePurge();
+
+ const SnapshotList& snapshots() const { return snapshots_; }
+
+ // Load the list of snapshots that are no newer than `max_seq` into
+ // `snap_vector`, in ascending order.
+ // `oldest_write_conflict_snapshot` is filled with the oldest snapshot
+ // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
+ void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot,
+ const SequenceNumber& max_seq) const {
+ InstrumentedMutexLock l(mutex());
+ snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
+ }
+
+ const ImmutableDBOptions& immutable_db_options() const {
+ return immutable_db_options_;
+ }
+
+ // Cancel all background jobs, including flush, compaction, background
+ // purging, stats dumping threads, etc. If `wait` = true, wait for the
+ // running jobs to abort or finish before returning. Otherwise, only
+ // sends the signals.
+ void CancelAllBackgroundWork(bool wait);
+
+ // Find Super version and reference it. Based on options, it might return
+ // the thread local cached one.
+ // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
+ SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held.
+ SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+ // Un-reference the super version and clean it up if it is the last reference.
+ void CleanupSuperVersion(SuperVersion* sv);
+
+ // Un-reference the super version and return it to the thread local cache
+ // if needed. If it is the last reference to the super version, clean it up
+ // after un-referencing it.
+ void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread.
+ void ReturnAndCleanupSuperVersion(uint32_t column_family_id, SuperVersion* sv);
+
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held. Return value only valid until next call to this function or
+ // mutex is released.
+ ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+ // Same as above, but should be called without the mutex held and not on
+ // the write thread.
+ std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id);
+
+ // Returns the number of currently running flushes.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_flushes() {
+ mutex_.AssertHeld();
+ return num_running_flushes_;
+ }
+
+ // Returns the number of currently running compactions.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_compactions() {
+ mutex_.AssertHeld();
+ return num_running_compactions_;
+ }
+
+ const WriteController& write_controller() { return write_controller_; }
+
+ InternalIterator* NewInternalIterator(
+ const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version,
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence);
+
+ // Hollow transaction shell used for recovery.
+ // These will then be passed to TransactionDB so that
+ // locks can be reacquired before writing can resume.
+ struct RecoveredTransaction {
+ std::string name_;
+ bool unprepared_;
+
+ struct BatchInfo {
+ uint64_t log_number_;
+ // TODO(lth): For unprepared, the memory usage here can be big for
+ // unprepared transactions. This is only useful for rollbacks, and we
+ // can in theory just keep keyset for that.
+ WriteBatch* batch_;
+ // Number of sub-batches. A new sub-batch is created if the txn attempts
+ // to insert a duplicate key/seq into the memtable. This is currently used
+ // in WritePreparedTxn/WriteUnpreparedTxn.
+ size_t batch_cnt_;
+ };
+
+ // This maps the seq of the first key in the batch to BatchInfo, which
+ // contains WriteBatch and other information relevant to the batch.
+ //
+ // For WriteUnprepared, batches_ can have size greater than 1, but for
+ // other write policies, it must be of size 1.
+ std::map<SequenceNumber, BatchInfo> batches_;
+
+ explicit RecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared)
+ : name_(name), unprepared_(unprepared) {
+ batches_[seq] = {log, batch, batch_cnt};
+ }
+
+ ~RecoveredTransaction() {
+ for (auto& it : batches_) {
+ delete it.second.batch_;
+ }
+ }
+
+ void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
+ size_t batch_cnt, bool unprepared) {
+ assert(batches_.count(seq) == 0);
+ batches_[seq] = {log_number, batch, batch_cnt};
+ // Prior state must be unprepared, since the prepare batch must be the
+ // last batch.
+ assert(unprepared_);
+ unprepared_ = unprepared;
+ }
+ };
+
+ bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
+
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions() {
+ return recovered_transactions_;
+ }
+
+ RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ if (it == recovered_transactions_.end()) {
+ return nullptr;
+ } else {
+ return it->second;
+ }
+ }
+
+ void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared_batch) {
+ // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
+ // times for every unprepared batch encountered during recovery.
+ //
+ // If the transaction is prepared, then the last call to
+ // InsertRecoveredTransaction will have unprepared_batch = false.
+ auto rtxn = recovered_transactions_.find(name);
+ if (rtxn == recovered_transactions_.end()) {
+ recovered_transactions_[name] = new RecoveredTransaction(
+ log, name, batch, seq, batch_cnt, unprepared_batch);
+ } else {
+ rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
+ }
+ logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
+ }
+
+ void DeleteRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ assert(it != recovered_transactions_.end());
+ auto* trx = it->second;
+ recovered_transactions_.erase(it);
+ for (const auto& info : trx->batches_) {
+ logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
+ info.second.log_number_);
+ }
+ delete trx;
+ }
+
+ void DeleteAllRecoveredTransactions() {
+ for (auto it = recovered_transactions_.begin();
+ it != recovered_transactions_.end(); ++it) {
+ delete it->second;
+ }
+ recovered_transactions_.clear();
+ }
+
+ void AddToLogsToFreeQueue(log::Writer* log_writer) {
+ logs_to_free_queue_.push_back(log_writer);
+ }
+
+ void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
+ superversions_to_free_queue_.push_back(sv);
+ }
+
+ void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
+
+ // Fill JobContext with snapshot information needed by flush and compaction.
+ void GetSnapshotContext(JobContext* job_context,
+ std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker);
+
+ // Not thread-safe.
+ void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
+
+ InstrumentedMutex* mutex() const { return &mutex_; }
+
+ // Initialize a brand new DB. The DB directory is expected to be empty before
+ // calling it.
+ Status NewDB();
+
+ // This is to be used only by internal rocksdb classes.
+ static Status Open(const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn);
+
+ static Status CreateAndNewDirectory(Env* env, const std::string& dirname,
+ std::unique_ptr<Directory>* directory);
+
+ // find stats map from stats_history_ with smallest timestamp in
+ // the range of [start_time, end_time)
+ bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map);
+
+ // Print information of all tombstones of all iterators to the std::string.
+ // This is only used by ldb. The output might be capped. Tombstones
+ // printed out are not guaranteed to be in any order.
+ Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str);
+
+#ifndef NDEBUG
+ // Compact any files in the named level that overlap [*begin, *end]
+ Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool disallow_trivial_move = false);
+
+ void TEST_SwitchWAL();
+
+ bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
+
+ bool TEST_IsLogGettingFlushed() {
+ return alive_log_files_.begin()->getting_flushed;
+ }
+
+ Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
+
+ // Force current memtable contents to be flushed.
+ Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
+ ColumnFamilyHandle* cfh = nullptr);
+
+ Status TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts);
+
+ // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
+ // is because in certain cases, we can flush column families, wait for the
+ // flush to complete, but delete the column family handle before the wait
+ // finishes. For example in CompactRange.
+ Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
+ const FlushOptions& flush_opts);
+
+ // Wait for memtable compaction
+ Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
+
+ // Wait for any compaction
+ // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this
+ // is only for the special test of CancelledCompactions
+ Status TEST_WaitForCompact(bool waitUnscheduled = false);
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family = nullptr);
+
+ // Return the current manifest file no.
+ uint64_t TEST_Current_Manifest_FileNo();
+
+ // Returns the number that'll be assigned to the next file that's created.
+ uint64_t TEST_Current_Next_FileNo();
+
+ // get total level0 file size. Only for testing.
+ uint64_t TEST_GetLevel0TotalSize();
+
+ void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata);
+
+ void TEST_LockMutex();
+
+ void TEST_UnlockMutex();
+
+ // REQUIRES: mutex locked
+ void* TEST_BeginWrite();
+
+ // REQUIRES: mutex locked
+ // pass the pointer that you got from TEST_BeginWrite()
+ void TEST_EndWrite(void* w);
+
+ uint64_t TEST_MaxTotalInMemoryState() const {
+ return max_total_in_memory_state_;
+ }
+
+ size_t TEST_LogsToFreeSize();
+
+ uint64_t TEST_LogfileNumber();
+
+ uint64_t TEST_total_log_size() const { return total_log_size_; }
+
+ // Returns column family name to ImmutableCFOptions map.
+ Status TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
+
+ // Return the latest MutableCFOptions of a column family
+ Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
+ MutableCFOptions* mutable_cf_options);
+
+ Cache* TEST_table_cache() { return table_cache_.get(); }
+
+ WriteController& TEST_write_controler() { return write_controller_; }
+
+ uint64_t TEST_FindMinLogContainingOutstandingPrep();
+ uint64_t TEST_FindMinPrepLogReferencedByMemTable();
+ size_t TEST_PreparedSectionCompletedSize();
+ size_t TEST_LogsWithPrepSize();
+
+ int TEST_BGCompactionsAllowed() const;
+ int TEST_BGFlushesAllowed() const;
+ size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ void TEST_WaitForDumpStatsRun(std::function<void()> callback) const;
+ void TEST_WaitForPersistStatsRun(std::function<void()> callback) const;
+ bool TEST_IsPersistentStatsEnabled() const;
+ size_t TEST_EstimateInMemoryStatsHistorySize() const;
+#endif // NDEBUG
+
+ protected:
+ const std::string dbname_;
+ std::string db_id_;
+ std::unique_ptr<VersionSet> versions_;
+ // Flag to check whether we allocated and own the info log file
+ bool own_info_log_;
+ const DBOptions initial_db_options_;
+ Env* const env_;
+ std::shared_ptr<FileSystem> fs_;
+ const ImmutableDBOptions immutable_db_options_;
+ MutableDBOptions mutable_db_options_;
+ Statistics* stats_;
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions_;
+ std::unique_ptr<Tracer> tracer_;
+ InstrumentedMutex trace_mutex_;
+ BlockCacheTracer block_cache_tracer_;
+
+ // State below is protected by mutex_
+ // With two_write_queues enabled, some of the variables that accessed during
+ // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
+ // logs_, logfile_number_. Refer to the definition of each variable below for
+ // more description.
+ mutable InstrumentedMutex mutex_;
+
+ ColumnFamilyHandleImpl* default_cf_handle_;
+ InternalStats* default_cf_internal_stats_;
+
+ // only used for dynamically adjusting max_total_wal_size. It is a sum of
+ // [write_buffer_size * max_write_buffer_number] over all column families
+ uint64_t max_total_in_memory_state_;
+ // If true, we have only one (default) column family. We use this to optimize
+ // some code-paths
+ bool single_column_family_mode_;
+
+ // The options to access storage files
+ const FileOptions file_options_;
+
+ // Additional options for compaction and flush
+ FileOptions file_options_for_compaction_;
+
+ std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+
+ // Increase the sequence number after writing each batch, whether memtable is
+ // disabled for that or not. Otherwise the sequence number is increased after
+ // writing each key into memtable. This implies that when disable_memtable is
+ // set, the seq is not increased at all.
+ //
+ // Default: false
+ const bool seq_per_batch_;
+ // This determines during recovery whether we expect one writebatch per
+ // recovered transaction, or potentially multiple writebatches per
+ // transaction. For WriteUnprepared, this is set to false, since multiple
+ // batches can exist per transaction.
+ //
+ // Default: true
+ const bool batch_per_txn_;
+
+ // Persist options to the options file. Except in DB::Open(), callers must
+ // satisfy the following preconditions:
+ // If need_mutex_lock = false, the DB mutex must already be held by the
+ // caller (the method will not lock it itself).
+ // If need_enter_write_thread = false, the caller must already have entered
+ // the write thread (the method will not enter it itself).
+ Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+ // The following two functions can only be called when:
+ // 1. WriteThread::Writer::EnterUnbatched() is used.
+ // 2. db_mutex is NOT held
+ Status RenameTempFileToOptionsFile(const std::string& file_name);
+ Status DeleteObsoleteOptionsFiles();
+
+ void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id);
+
+ void NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+ void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats, int job_id);
+
+ void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id);
+ void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+ const MemTableInfo& mem_table_info);
+
+#ifndef ROCKSDB_LITE
+ void NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
+#endif // !ROCKSDB_LITE
+
+ void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusDbInfo() const;
+
+ // If disable_memtable is set the application logic must guarantee that the
+ // batch will still be skipped from memtable during the recovery. An exception
+ // to this is seq_per_batch_ mode, in which since each batch already takes one
+ // seq, it is ok for the batch to write to memtable during recovery as long as
+ // it only takes one sequence number: i.e., no duplicate keys.
+ // In WriteCommitted it is guaranteed since disable_memtable is used for
+ // prepare batch which will be written to memtable later during the commit,
+ // and in WritePrepared it is guaranteed since it will be used only for WAL
+ // markers which will never be written to memtable. If the commit marker is
+ // accompanied with CommitTimeWriteBatch that is not written to memtable as
+ // long as it has no duplicate keys, it does not violate the one-seq-per-batch
+ // policy.
+ // batch_cnt is expected to be non-zero in seq_per_batch mode and
+ // indicates the number of sub-batches. A sub-batch is a subset of the write
+ // batch that does not have duplicate keys.
+ Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false, uint64_t* seq_used = nullptr,
+ size_t batch_cnt = 0,
+ PreReleaseCallback* pre_release_callback = nullptr);
+
+ Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false,
+ uint64_t* seq_used = nullptr);
+
+ // Write only to memtables without joining any write queue
+ Status UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t log_ref, SequenceNumber seq,
+ const size_t sub_batch_cnt);
+
+ // Whether the batch requires to be assigned with an order
+ enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+ // Whether it requires publishing last sequence or not
+ enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+ // Join the write_thread to write the batch only to the WAL. It is the
+ // responsibility of the caller to also write the write batch to the memtable
+ // if it is required.
+ //
+ // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
+ // indicating the number of sub-batches in my_batch. A sub-batch is a subset
+ // of the write batch that does not have duplicate keys. When seq_per_batch is
+ // not set, each key is a separate sub_batch. Otherwise each duplicate key
+ // marks start of a new sub-batch.
+ Status WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& options,
+ WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable);
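+
+ // Illustrative example (not part of the original comments): when
+ // seq_per_batch is set, a write batch {Put(k1), Put(k2), Put(k1)} contains
+ // two sub-batches, {Put(k1), Put(k2)} and {Put(k1)}, because the second
+ // Put(k1) repeats a key already seen in the current sub-batch, so
+ // sub_batch_cnt would be 2.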
+
+ // write cached_recoverable_state_ to memtable if it is not empty
+ // The writer must be the leader in write_thread_ and must hold mutex_
+ Status WriteRecoverableState();
+
+ // Actual implementation of Close()
+ Status CloseImpl();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
+ // skipped.
+ virtual Status Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, bool error_if_log_file_exist = false,
+ bool error_if_data_exists_in_logs = false,
+ uint64_t* recovered_seq = nullptr);
+
+ virtual bool OwnTablesAndLogs() const { return true; }
+
+ private:
+ friend class DB;
+ friend class ErrorHandler;
+ friend class InternalStats;
+ friend class PessimisticTransaction;
+ friend class TransactionBaseImpl;
+ friend class WriteCommittedTxn;
+ friend class WritePreparedTxn;
+ friend class WritePreparedTxnDB;
+ friend class WriteBatchWithIndex;
+ friend class WriteUnpreparedTxnDB;
+ friend class WriteUnpreparedTxn;
+
+#ifndef ROCKSDB_LITE
+ friend class ForwardIterator;
+#endif
+ friend struct SuperVersion;
+ friend class CompactedDBImpl;
+ friend class DBTest_ConcurrentFlushWAL_Test;
+ friend class DBTest_MixedSlowdownOptionsStop_Test;
+ friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
+ friend class DBCompactionTest_CompactionDuringShutdown_Test;
+ friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
+#ifndef NDEBUG
+ friend class DBTest2_ReadCallbackTest_Test;
+ friend class WriteCallbackTest_WriteWithCallbackTest_Test;
+ friend class XFTransactionWriteHandler;
+ friend class DBBlobIndexTest;
+ friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+#endif
+
+ struct CompactionState;
+ struct PrepickedCompaction;
+ struct PurgeFileInfo;
+
+ struct WriteContext {
+ SuperVersionContext superversion_context;
+ autovector<MemTable*> memtables_to_free_;
+
+ explicit WriteContext(bool create_superversion = false)
+ : superversion_context(create_superversion) {}
+
+ ~WriteContext() {
+ superversion_context.Clean();
+ for (auto& m : memtables_to_free_) {
+ delete m;
+ }
+ }
+ };
+
+ struct LogFileNumberSize {
+ explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
+ void AddSize(uint64_t new_size) { size += new_size; }
+ uint64_t number;
+ uint64_t size = 0;
+ bool getting_flushed = false;
+ };
+
+ struct LogWriterNumber {
+ // pass ownership of _writer
+ LogWriterNumber(uint64_t _number, log::Writer* _writer)
+ : number(_number), writer(_writer) {}
+
+ log::Writer* ReleaseWriter() {
+ auto* w = writer;
+ writer = nullptr;
+ return w;
+ }
+ Status ClearWriter() {
+ Status s = writer->WriteBuffer();
+ delete writer;
+ writer = nullptr;
+ return s;
+ }
+
+ uint64_t number;
+ // Visual Studio doesn't allow the deque's element type to be noncopyable,
+ // which it would be with a std::unique_ptr member, so a raw owning pointer
+ // is used instead.
+ log::Writer* writer; // own
+ // true for some prefix of logs_
+ bool getting_synced = false;
+ };
+
+ // PurgeFileInfo is a structure to hold information of files to be deleted in
+ // purge_files_
+ struct PurgeFileInfo {
+ std::string fname;
+ std::string dir_to_sync;
+ FileType type;
+ uint64_t number;
+ int job_id;
+ PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
+ int jid)
+ : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
+ };
+
+ // Argument required by background flush thread.
+ struct BGFlushArg {
+ BGFlushArg()
+ : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
+ BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
+ SuperVersionContext* superversion_context)
+ : cfd_(cfd),
+ max_memtable_id_(max_memtable_id),
+ superversion_context_(superversion_context) {}
+
+ // Column family to flush.
+ ColumnFamilyData* cfd_;
+ // Maximum ID of memtable to flush. In this column family, memtables with
+ // IDs smaller than this value must be flushed before this flush completes.
+ uint64_t max_memtable_id_;
+ // Pointer to a SuperVersionContext object. After flush completes, RocksDB
+ // installs a new superversion for the column family. This operation
+ // requires a SuperVersionContext object (currently embedded in JobContext).
+ SuperVersionContext* superversion_context_;
+ };
+
+ // Argument passed to flush thread.
+ struct FlushThreadArg {
+ DBImpl* db_;
+
+ Env::Priority thread_pri_;
+ };
+
+ // Information for a manual compaction
+ struct ManualCompactionState {
+ ColumnFamilyData* cfd;
+ int input_level;
+ int output_level;
+ uint32_t output_path_id;
+ Status status;
+ bool done;
+ bool in_progress; // compaction request being processed?
+ bool incomplete; // only part of requested range compacted
+ bool exclusive; // current behavior of only one manual
+ bool disallow_trivial_move; // Force actual compaction to run
+ const InternalKey* begin; // nullptr means beginning of key range
+ const InternalKey* end; // nullptr means end of key range
+ InternalKey* manual_end; // how far we are compacting
+ InternalKey tmp_storage; // Used to keep track of compaction progress
+ InternalKey tmp_storage1; // Used to keep track of compaction progress
+ };
+ struct PrepickedCompaction {
+ // background compaction takes ownership of `compaction`.
+ Compaction* compaction;
+ // caller retains ownership of `manual_compaction_state` as it is reused
+ // across background compactions.
+ ManualCompactionState* manual_compaction_state; // nullptr if non-manual
+ // task limiter token is requested during compaction picking.
+ std::unique_ptr<TaskLimiterToken> task_token;
+ };
+
+ struct CompactionArg {
+ // caller retains ownership of `db`.
+ DBImpl* db;
+ // background compaction takes ownership of `prepicked_compaction`.
+ PrepickedCompaction* prepicked_compaction;
+ };
+
+ // Initialize the built-in column family for persistent stats. Depending on
+ // whether on-disk persistent stats have been enabled before, it may either
+ // create a new column family and column family handle or just a column family
+ // handle.
+ // Required: DB mutex held
+ Status InitPersistStatsColumnFamily();
+
+ // Persistent Stats column family has two format version keys which are used
+ // for compatibility check. Write format version if it's created for the
+ // first time, read format version and check compatibility if recovering
+ // from disk. This function requires DB mutex held at entrance but may
+ // release and re-acquire DB mutex in the process.
+ // Required: DB mutex held
+ Status PersistentStatsProcessFormatVersion();
+
+ Status ResumeImpl();
+
+ void MaybeIgnoreError(Status* s) const;
+
+ const Status CreateArchivalDirectory();
+
+ Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& cf_name,
+ ColumnFamilyHandle** handle);
+
+ Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles();
+ // Delete obsolete files and log status and information of file deletion
+ void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync, FileType type,
+ uint64_t number);
+
+ // Background process needs to call
+ // auto x = CaptureCurrentFileNumberInPendingOutputs()
+ // auto file_num = versions_->NewFileNumber();
+ // <do something>
+ // ReleaseFileNumberFromPendingOutputs(x)
+ // This will protect any file with number `file_num` or greater from being
+ // deleted while <do something> is running.
+ // -----------
+ // This function will capture current file number and append it to
+ // pending_outputs_. This will prevent any background process from deleting any
+ // file created after this point.
+ std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+ // This function should be called with the result of
+ // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+ // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+ // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+ // and not blocked by any other pending_outputs_ entries)
+ void ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v);
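+
+ // A minimal sketch (hypothetical caller, mirroring the pattern described
+ // above) of how these two calls pair up:
+ //
+ //   std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem(
+ //       new std::list<uint64_t>::iterator(
+ //           CaptureCurrentFileNumberInPendingOutputs()));
+ //   uint64_t file_num = versions_->NewFileNumber();
+ //   // ... create the file with number `file_num` ...
+ //   ReleaseFileNumberFromPendingOutputs(pending_output_elem);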
+
+ Status SyncClosedLogs(JobContext* job_context);
+
+ // Flush the in-memory write buffer to storage. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful. Then
+ // installs a new super version for the column family.
+ Status FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* madeProgress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri);
+
+ // Flush the memtables of (multiple) column families to multiple files on
+ // persistent storage.
+ Status FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ Status AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ // REQUIRES: log_numbers are sorted in ascending order
+ // corrupted_log_found is set to true if we recover from a corrupted log file.
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found);
+
+ // The following two methods are used to flush a memtable to
+ // storage. The first one is used at database recovery time (when the
+ // database is opened) and is heavyweight because it holds the mutex
+ // for the entire period. The second method, WriteLevel0Table, supports
+ // concurrently flushing memtables to storage.
+ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit);
+
+ // Restore alive_log_files_ and total_log_size_ after recovery.
+ // It needs to run only when there's no flush during recovery
+ // (e.g. avoid_flush_during_recovery=true). May also trigger flush
+ // in case total_log_size > max_total_wal_size.
+ Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+ // num_bytes: for slowdown case, delay time is calculated based on
+ // `num_bytes` going through.
+ Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
+
+ Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status ScheduleFlushes(WriteContext* context);
+
+ void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+ Status TrimMemtableHistory(WriteContext* context);
+
+ Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+ void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
+
+ // Force current memtable contents to be flushed.
+ Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+ FlushReason flush_reason, bool writes_stopped = false);
+
+ Status AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& options, FlushReason flush_reason,
+ bool writes_stopped = false);
+
+ // Wait until flushing this column family won't stall writes
+ Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed);
+
+ // Wait for memtable flushed.
+ // If flush_memtable_id is non-null, wait until the memtable with the ID
+ // gets flushed. Otherwise, wait until the column family doesn't have any
+ // memtable pending flush.
+ // resuming_from_bg_err indicates whether the caller is attempting to resume
+ // from background error.
+ Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+ const uint64_t* flush_memtable_id = nullptr,
+ bool resuming_from_bg_err = false) {
+ return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+ resuming_from_bg_err);
+ }
+ // Wait for memtables to be flushed for multiple column families.
+ Status WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err);
+
+ inline void WaitForPendingWrites() {
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+ // If pipelined write is enabled, wait for all pending memtable
+ // writers.
+ if (immutable_db_options_.enable_pipelined_write) {
+ // Memtable writers may call DB::Get in case max_successive_merges > 0,
+ // which may lock mutex. Unlocking mutex here to avoid deadlock.
+ mutex_.Unlock();
+ write_thread_.WaitForMemTableWriters();
+ mutex_.Lock();
+ }
+
+ if (!immutable_db_options_.unordered_write) {
+ // Then the writes are finished before the next write group starts
+ return;
+ }
+
+ // Wait for the ones who already wrote to the WAL to finish their
+ // memtable write.
+ if (pending_memtable_writes_.load() != 0) {
+ std::unique_lock<std::mutex> guard(switch_mutex_);
+ switch_cv_.wait(guard,
+ [&] { return pending_memtable_writes_.load() == 0; });
+ }
+ }
+
+ // REQUIRES: mutex locked and in write thread.
+ void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status SwitchWAL(WriteContext* write_context);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status HandleWriteBufferFull(WriteContext* write_context);
+
+ // REQUIRES: mutex locked
+ Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
+ WriteContext* write_context);
+
+ WriteBatch* MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, size_t* write_with_wal,
+ WriteBatch** to_be_cached_state);
+
+ Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+ uint64_t* log_used, uint64_t* log_size);
+
+ Status WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence);
+
+ Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used, SequenceNumber* last_sequence,
+ size_t seq_inc);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ void WriteStatusCheck(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ in case of memtable insert error.
+ void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+#ifndef ROCKSDB_LITE
+
+ Status CompactFilesImpl(const CompactionOptions& compact_options,
+ ColumnFamilyData* cfd, Version* version,
+ const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names,
+ const int output_level, int output_path_id,
+ JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info);
+
+ // Wait for current IngestExternalFile() calls to finish.
+ // REQUIRES: mutex_ held
+ void WaitForIngestFile();
+
+#else
+ // IngestExternalFile is not supported in ROCKSDB_LITE so this function
+ // will be a no-op
+ void WaitForIngestFile() {}
+#endif // ROCKSDB_LITE
+
+ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+ void MaybeScheduleFlushOrCompaction();
+
+ // A flush request specifies the column families to flush as well as the
+ // largest memtable id to persist for each column family. Once all the
+ // memtables whose IDs are smaller than or equal to this per-column-family
+ // value have been flushed, this flush request is considered to have
+ // completed its work for that column family. After completing the work for
+ // all column families in this request, this flush is considered complete.
+ typedef std::vector<std::pair<ColumnFamilyData*, uint64_t>> FlushRequest;
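+ // For example (illustrative values only), a FlushRequest of
+ //   {{cfd_a, 12}, {cfd_b, 7}}
+ // asks to flush all memtables of cfd_a with IDs <= 12 and all memtables of
+ // cfd_b with IDs <= 7 before the request is considered complete.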
+
+ void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req);
+
+ void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
+
+ void SchedulePendingCompaction(ColumnFamilyData* cfd);
+ void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id);
+ static void BGWorkCompaction(void* arg);
+ // Runs a pre-chosen universal compaction involving bottom level in a
+ // separate, bottom-pri thread pool.
+ static void BGWorkBottomCompaction(void* arg);
+ static void BGWorkFlush(void* arg);
+ static void BGWorkPurge(void* arg);
+ static void UnscheduleCompactionCallback(void* arg);
+ static void UnscheduleFlushCallback(void* arg);
+ void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ void BackgroundCallFlush(Env::Priority thread_pri);
+ void BackgroundCallPurge();
+ Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri);
+
+ bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+ const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_bookkeeping, LogBuffer* log_buffer);
+
+ // Request a compaction task token from the compaction thread limiter.
+ // It always succeeds if force = true or the limiter is disabled.
+ bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer);
+
+ // Schedule background tasks
+ void StartTimedTasks();
+
+ void PrintStatistics();
+
+ size_t EstimateInMemoryStatsHistorySize() const;
+
+ // persist stats to column family "_persistent_stats"
+ void PersistStats();
+
+ // dump rocksdb.stats to LOG
+ void DumpStats();
+
+ // Return the minimum empty level that could hold the total data in the
+ // input level. Return the input level if no such level can be found.
+ int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ // Move the files in the input level to the target level.
+ // If target_level < 0, automatically calculate the minimum level that could
+ // hold the data set.
+ Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+ // helper functions for adding and removing from flush & compaction queues
+ void AddToCompactionQueue(ColumnFamilyData* cfd);
+ ColumnFamilyData* PopFirstFromCompactionQueue();
+ FlushRequest PopFirstFromFlushQueue();
+
+ // Pick the first unthrottled compaction with task token from queue.
+ ColumnFamilyData* PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
+
+ // helper function to call after some of the logs_ were synced
+ void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status);
+
+ SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock = true);
+
+ uint64_t GetMaxTotalWalSize() const;
+
+ Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
+
+ Status CloseHelper();
+
+ void WaitForBackgroundWork();
+
+ // Background threads call this function, which is just a wrapper around
+ // the InstallSuperVersion() function. Background threads carry
+ // sv_context which can have new_superversion already
+ // allocated.
+ // All ColumnFamily state changes go through this function. Here we analyze
+ // the new state and we schedule background work if we detect that the new
+ // state needs flush or compaction.
+ void InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options);
+
+ bool GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value);
+ bool GetPropertyHandleOptionsStatistics(std::string* value);
+
+ bool HasPendingManualCompaction();
+ bool HasExclusiveManualCompaction();
+ void AddManualCompaction(ManualCompactionState* m);
+ void RemoveManualCompaction(ManualCompactionState* m);
+ bool ShouldntRunManualCompaction(ManualCompactionState* m);
+ bool HaveManualCompaction(ColumnFamilyData* cfd);
+ bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
+#ifndef ROCKSDB_LITE
+ void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& compaction_job_stats,
+ const int job_id, const Version* current,
+ CompactionJobInfo* compaction_job_info) const;
+ // Reserve the next 'num' file numbers for to-be-ingested external SST files,
+ // and return the current file_number in 'next_file_number'.
+ // Write a version edit to the MANIFEST.
+ Status ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number);
+#endif //! ROCKSDB_LITE
+
+ bool ShouldPurge(uint64_t file_number) const;
+ void MarkAsGrabbedForPurge(uint64_t file_number);
+
+ size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
+
+ Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size, log::Writer** new_log);
+
+ // Validate self-consistency of DB options
+ static Status ValidateOptions(const DBOptions& db_options);
+ // Validate self-consistency of DB options and its consistency with cf options
+ static Status ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families);
+
+ // Utility function to do some debug validation and sort the given vector
+ // of MultiGet keys
+ void PrepareMultiGetKeys(
+ const size_t num_keys, bool sorted,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
+
+ // A structure to hold the information required to process MultiGet of keys
+ // belonging to one column family. For a multi column family MultiGet, there
+ // will be a container of these objects.
+ struct MultiGetColumnFamilyData {
+ ColumnFamilyHandle* cf;
+ ColumnFamilyData* cfd;
+
+ // For the batched MultiGet which relies on sorted keys, start specifies
+ // the index of first key belonging to this column family in the sorted
+ // list.
+ size_t start;
+
+ // For the batched MultiGet case, num_keys specifies the number of keys
+ // belonging to this column family in the sorted list
+ size_t num_keys;
+
+ // SuperVersion for the column family obtained in a manner that ensures a
+ // consistent view across all column families in the DB
+ SuperVersion* super_version;
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
+ SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(0),
+ num_keys(0),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
+ size_t count, SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(first),
+ num_keys(count),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData() = default;
+ };
+
+ // A common function to obtain a consistent snapshot, which can be implicit
+ // if the user doesn't specify a snapshot in read_options, across
+ // multiple column families for MultiGet. It will attempt to get an implicit
+ // snapshot without acquiring the db_mutex, but will give up after a few
+ // tries and acquire the mutex if a memtable flush happens. The template
+ // allows both the batched and non-batched MultiGet to call this with
+ // either an std::unordered_map or autovector of column families.
+ //
+ // If callback is non-null, the callback is refreshed with the snapshot
+ // sequence number
+ //
+ // A return value of true indicates that the SuperVersions were obtained
+ // from the ColumnFamilyData, whereas false indicates they are thread
+ // local
+ template <class T>
+ bool MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot);
+
+ // The actual implementation of the batching MultiGet. The caller is expected
+ // to have acquired the SuperVersion and pass in a snapshot sequence number
+ // in order to construct the LookupKeys. The start_key and num_keys specify
+ // the range of keys in the sorted_keys vector for a single column family.
+ void MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback,
+ bool* is_blob_index);
+
+ // table_cache_ provides its own synchronization
+ std::shared_ptr<Cache> table_cache_;
+
+ // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
+ FileLock* db_lock_;
+
+ // In addition to mutex_, stats_history_mutex_ protects writes to stats_history_
+ InstrumentedMutex stats_history_mutex_;
+ // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
+ // logfile_number_. With two_write_queues it also protects alive_log_files_,
+ // and log_empty_. Refer to the definition of each variable below for more
+ // details.
+ // Note: to avoid deadlock, if both log_write_mutex_ and mutex_ need to be
+ // acquired, mutex_ should be acquired first and then log_write_mutex_.
+ InstrumentedMutex log_write_mutex_;
+
+ std::atomic<bool> shutting_down_;
+ std::atomic<bool> manual_compaction_paused_;
+ // This condition variable is signaled on these conditions:
+ // * whenever bg_compaction_scheduled_ goes down to 0
+ // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
+ // made any progress
+ // * whenever a compaction made any progress
+ // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
+ // (i.e. whenever a flush is done, even if it didn't make any progress)
+ // * whenever there is an error in background purge, flush or compaction
+ // * whenever num_running_ingest_file_ goes to 0.
+ // * whenever pending_purge_obsolete_files_ goes to 0.
+ // * whenever disable_delete_obsolete_files_ goes to 0.
+ // * whenever SetOptions successfully updates options.
+ // * whenever a column family is dropped.
+ InstrumentedCondVar bg_cv_;
+ // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
+ // must be under either mutex_ or log_write_mutex_. Since after ::Open,
+ // logfile_number_ is currently updated only in write_thread_, it can be read
+ // from the same write_thread_ without any locks.
+ uint64_t logfile_number_;
+ std::deque<uint64_t>
+ log_recycle_files_; // a list of log files that we can recycle
+ bool log_dir_synced_;
+ // Without two_write_queues, read and writes to log_empty_ are protected by
+ // mutex_. Since it is currently updated/read only in write_thread_, it can be
+ // accessed from the same write_thread_ without any locks. With
+ // two_write_queues writes, where it can be updated in different threads,
+ // read and writes are protected by log_write_mutex_ instead. This is to avoid
+ // expensive mutex_ lock during WAL write, which updates log_empty_.
+ bool log_empty_;
+
+ ColumnFamilyHandleImpl* persist_stats_cf_handle_;
+
+ bool persistent_stats_cfd_exists_ = true;
+
+ // Without two_write_queues, read and writes to alive_log_files_ are
+ // protected by mutex_. However since back() is never popped, and push_back()
+ // is done only from write_thread_, the same thread can access the item
+ // referred to by back() without mutex_. With two_write_queues_, writes
+ // are protected by locking both mutex_ and log_write_mutex_, and reads must
+ // be under either mutex_ or log_write_mutex_.
+ std::deque<LogFileNumberSize> alive_log_files_;
+ // Log files that aren't fully synced, and the current log file.
+ // Synchronization:
+ // - push_back() is done from write_thread_ with locked mutex_ and
+ // log_write_mutex_
+ // - pop_front() is done from any thread with locked mutex_ and
+ // log_write_mutex_
+ // - reads are done with either locked mutex_ or log_write_mutex_
+ // - back() and items with getting_synced=true are not popped,
+ // - The same thread that sets getting_synced=true will reset it.
+ // - it follows that the object referred by back() can be safely read from
+ // the write_thread_ without using mutex
+ // - it follows that the items with getting_synced=true can be safely read
+ // from the same thread that has set getting_synced=true
+ std::deque<LogWriterNumber> logs_;
+ // Signaled when getting_synced becomes false for some of the logs_.
+ InstrumentedCondVar log_sync_cv_;
+ // This is the app-level state that is written to the WAL but will be used
+ // only during recovery. Using this feature enables not writing the state to
+ // memtable on normal writes and hence improving the throughput. Each new
+ // write of the state will replace the previous state entirely even if the
+ // keys in the two consecutive states do not overlap.
+ // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+ // Otherwise only the head of write_thread_ can access it.
+ WriteBatch cached_recoverable_state_;
+ std::atomic<bool> cached_recoverable_state_empty_ = {true};
+ std::atomic<uint64_t> total_log_size_;
+
+ // If this is non-empty, we need to delete these log files in background
+ // threads. Protected by db mutex.
+ autovector<log::Writer*> logs_to_free_;
+
+ bool is_snapshot_supported_;
+
+ std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
+
+ std::map<std::string, uint64_t> stats_slice_;
+
+ bool stats_slice_initialized_ = false;
+
+ Directories directories_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ WriteThread write_thread_;
+ WriteBatch tmp_batch_;
+ // The write thread used when writers have no memtable write. This is used
+ // in 2PC to batch the prepares separately from the serial commit.
+ WriteThread nonmem_write_thread_;
+
+ WriteController write_controller_;
+
+ // Size of the last batch group. In slowdown mode, next write needs to
+ // sleep if it uses up the quota.
+ // Note: This is to protect memtable and compaction. If the batch only writes
+ // to the WAL, its size need not be included in this.
+ uint64_t last_batch_group_size_;
+
+ FlushScheduler flush_scheduler_;
+
+ TrimHistoryScheduler trim_history_scheduler_;
+
+ SnapshotList snapshots_;
+
+ // For each background job, pending_outputs_ keeps the current file number at
+ // the time that background job started.
+ // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+ // number bigger than any of the file number in pending_outputs_. Since file
+ // numbers grow monotonically, this also means that pending_outputs_ is always
+ // sorted. After a background job is done executing, its file number is
+ // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
+ // it up.
+ // State is protected with db mutex.
+ std::list<uint64_t> pending_outputs_;
+
+ // flush_queue_ and compaction_queue_ hold column families that we need to
+ // flush and compact, respectively.
+ // A column family is inserted into flush_queue_ when it satisfies condition
+ // cfd->imm()->IsFlushPending()
+ // A column family is inserted into compaction_queue_ when it satisfies
+ // condition cfd->NeedsCompaction()
+ // Column families in this list are all Ref()-erenced
+ // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+ // do RAII on ColumnFamilyData
+ // Column families are in this queue when they need to be flushed or
+ // compacted. Consumers of these queues are flush and compaction threads. When
+ // column family is put on this queue, we increase unscheduled_flushes_ and
+ // unscheduled_compactions_. When these variables are bigger than zero, that
+ // means we need to schedule background threads for flush and compaction.
+ // Once the background threads are scheduled, we decrease unscheduled_flushes_
+ // and unscheduled_compactions_. That way we keep track of number of
+ // compaction and flush threads we need to schedule. This scheduling is done
+ // in MaybeScheduleFlushOrCompaction()
+ // invariant(column family present in flush_queue_ <==>
+ // ColumnFamilyData::pending_flush_ == true)
+ std::deque<FlushRequest> flush_queue_;
+ // invariant(column family present in compaction_queue_ <==>
+ // ColumnFamilyData::pending_compaction_ == true)
+ std::deque<ColumnFamilyData*> compaction_queue_;
+
+ // A map to store file numbers and filenames of the files to be purged
+ std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
+
+ // A set to store the file numbers that have been assigned to certain
+ // JobContext. Current implementation tracks ssts only.
+ std::unordered_set<uint64_t> files_grabbed_for_purge_;
+
+ // A queue to store log writers to close
+ std::deque<log::Writer*> logs_to_free_queue_;
+ std::deque<SuperVersion*> superversions_to_free_queue_;
+ int unscheduled_flushes_;
+ int unscheduled_compactions_;
+
+ // count how many background compactions are running or have been scheduled in
+ // the BOTTOM pool
+ int bg_bottom_compaction_scheduled_;
+
+ // count how many background compactions are running or have been scheduled
+ int bg_compaction_scheduled_;
+
+ // stores the number of compactions that are currently running
+ int num_running_compactions_;
+
+ // number of background memtable flush jobs, submitted to the HIGH pool
+ int bg_flush_scheduled_;
+
+ // stores the number of flushes that are currently running
+ int num_running_flushes_;
+
+ // number of background obsolete file purge jobs, submitted to the HIGH pool
+ int bg_purge_scheduled_;
+
+ std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+ // Counter that controls deletion of obsolete files.
+ // If 0, deletion is enabled.
+ // If non-zero, files will not be deleted.
+ // Using a counter allows two different threads to call
+ // EnableFileDeletions() and DisableFileDeletions()
+ // without any synchronization.
+ int disable_delete_obsolete_files_;
+
+ // Number of times FindObsoleteFiles has found deletable files and the
+ // corresponding call to PurgeObsoleteFiles has not yet finished.
+ int pending_purge_obsolete_files_;
+
+ // last time when DeleteObsoleteFiles with full scan was executed. Originally
+ // initialized with startup time.
+ uint64_t delete_obsolete_files_last_run_;
+
+ // last time stats were dumped to LOG
+ std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+ // The thread that wants to switch memtable can wait on this cv until the
+ // pending writes to memtable finish.
+ std::condition_variable switch_cv_;
+ // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+ std::mutex switch_mutex_;
+ // Number of threads intending to write to memtable
+ std::atomic<size_t> pending_memtable_writes_ = {};
+
+ // Each flush or compaction gets its own job id. This counter makes sure
+ // they're unique
+ std::atomic<int> next_job_id_;
+
+ // A flag indicating whether the current rocksdb database has any
+ // data that is not yet persisted into either WAL or SST file.
+ // Used when disableWAL is true.
+ std::atomic<bool> has_unpersisted_data_;
+
+ // Set if an attempt was made to flush all column families that
+ // the oldest log depends on but uncommitted data in the oldest
+ // log prevents the log from being released.
+ // We must attempt to free the dependent memtables again
+ // at a later time after the transaction in the oldest
+ // log is fully committed.
+ bool unable_to_release_oldest_log_;
+
+ static const int KEEP_LOG_FILE_NUM = 1000;
+ // MSVC version 1800 still does not have constexpr for ::max()
+ static const uint64_t kNoTimeOut = port::kMaxUint64;
+
+ std::string db_absolute_path_;
+
+ // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+ // calls.
+ // REQUIRES: mutex held
+ int num_running_ingest_file_;
+
+#ifndef ROCKSDB_LITE
+ WalManager wal_manager_;
+#endif // ROCKSDB_LITE
+
+ // Unified interface for logging events
+ EventLogger event_logger_;
+
+ // A value of > 0 temporarily disables scheduling of background work
+ int bg_work_paused_;
+
+ // A value of > 0 temporarily disables scheduling of background compaction
+ int bg_compaction_paused_;
+
+ // Guard against multiple concurrent refitting
+ bool refitting_level_;
+
+ // Indicate DB was opened successfully
+ bool opened_successfully_;
+
+ // The min threshold to trigger bottommost compaction for removing
+ // garbage, among all column families.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ LogsWithPrepTracker logs_with_prep_tracker_;
+
+ // Callback for compaction to check if a key is visible to a snapshot.
+ // REQUIRES: mutex held
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+ // Callback for when the cached_recoverable_state_ is written to memtable
+ // Only to be set during initialization
+ std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+ // handle for scheduling stats dumping at fixed intervals
+ // REQUIRES: mutex locked
+ std::unique_ptr<ROCKSDB_NAMESPACE::RepeatableThread> thread_dump_stats_;
+
+ // handle for scheduling stats snapshotting at fixed intervals
+ // REQUIRES: mutex locked
+ std::unique_ptr<ROCKSDB_NAMESPACE::RepeatableThread> thread_persist_stats_;
+
+ // When set, we use a separate queue for writes that don't write to memtable.
+ // In 2PC these are the writes at Prepare phase.
+ const bool two_write_queues_;
+ const bool manual_wal_flush_;
+
+ // If set, LastSequence also indicates the last published sequence visible
+ // to the readers. Otherwise LastPublishedSequence should be used.
+ const bool last_seq_same_as_publish_seq_;
+ // It indicates that a customized gc algorithm must be used for
+ // flush/compaction and if it is not provided via SnapshotChecker, we should
+ // disable gc to be safe.
+ const bool use_custom_gc_;
+ // Flag to indicate that the DB instance shutdown has been initiated. This
+ // is different from the shutting_down_ atomic in that it is set at the beginning
+ // of shutdown sequence, specifically in order to prevent any background
+ // error recovery from going on in parallel. The latter, shutting_down_,
+ // is set a little later during the shutdown after scheduling memtable
+ // flushes
+ std::atomic<bool> shutdown_initiated_;
+ // Flag to indicate whether sst_file_manager object was allocated in
+ // DB::Open() or passed to us
+ bool own_sfm_;
+
+ // Clients must periodically call SetPreserveDeletesSequenceNumber()
+ // to advance this seqnum. Default value is 0 which means ALL deletes are
+ // preserved. Note that this has no effect if DBOptions.preserve_deletes
+ // is set to false.
+ std::atomic<SequenceNumber> preserve_deletes_seqnum_;
+ const bool preserve_deletes_;
+
+ // Flag to check whether Close() has been called on this DB
+ bool closed_;
+
+ ErrorHandler error_handler_;
+
+ // Conditional variable to coordinate installation of atomic flush results.
+ // With atomic flush, each bg thread installs the result of flushing multiple
+ // column families, and different threads can flush different column
+ // families. It's difficult to rely on one thread to perform batch
+ // installation for all threads. This is different from the non-atomic flush
+ // case.
+ // atomic_flush_install_cv_ makes sure that threads install atomic flush
+ // results sequentially. Flush results of memtables with lower IDs get
+ // installed to MANIFEST first.
+ InstrumentedCondVar atomic_flush_install_cv_;
+
+ bool wal_in_db_path_;
+};
+
+extern Options SanitizeOptions(const std::string& db, const Options& src);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
+
+extern CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ autovector<VersionEdit*> edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+ const autovector<MemTable*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+ if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+ if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
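+
+// A quick illustration (hypothetical values, not from the source): clamp a
+// user-supplied option into the range [20, 1000000].
+//
+//   int max_open_files = 10;
+//   ClipToRange(&max_open_files, 20, 1000000);  // max_open_files becomes 20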
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 000000000..c7b3510c3
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3116 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+ ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+ // Check if we have enough room to do the compaction
+ bool enough_room = true;
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Pass the current bg_error_ to SFM so it can decide what checks to
+ // perform. If this DB instance hasn't seen any error yet, the SFM can be
+ // optimistic and not do disk space checks
+ enough_room =
+ sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError());
+ if (enough_room) {
+ *sfm_reserved_compact_space = true;
+ }
+ }
+#else
+ (void)cfd;
+ (void)inputs;
+ (void)sfm_reserved_compact_space;
+#endif // ROCKSDB_LITE
+ if (!enough_room) {
+ // Just in case tests want to change the value of enough_room
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room);
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Cancelled compaction because not enough room");
+ RecordTick(stats_, COMPACTION_CANCELLED, 1);
+ }
+ return enough_room;
+}
+
+bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer) {
+ assert(*token == nullptr);
+ auto limiter = static_cast<ConcurrentTaskLimiterImpl*>(
+ cfd->ioptions()->compaction_thread_limiter.get());
+ if (limiter == nullptr) {
+ return true;
+ }
+ *token = limiter->GetToken(force);
+ if (*token != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Thread limiter [%s] increase [%s] compaction task, "
+ "force: %s, tasks after: %d",
+ limiter->GetName().c_str(), cfd->GetName().c_str(),
+ force ? "true" : "false", limiter->GetOutstandingTask());
+ return true;
+ }
+ return false;
+}
+
+Status DBImpl::SyncClosedLogs(JobContext* job_context) {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
+ mutex_.AssertHeld();
+ autovector<log::Writer*, 1> logs_to_sync;
+ uint64_t current_log_number = logfile_number_;
+ while (logs_.front().number < current_log_number &&
+ logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number < current_log_number; ++it) {
+ auto& log = *it;
+ assert(!log.getting_synced);
+ log.getting_synced = true;
+ logs_to_sync.push_back(log.writer);
+ }
+
+ Status s;
+ if (!logs_to_sync.empty()) {
+ mutex_.Unlock();
+
+ for (log::Writer* log : logs_to_sync) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
+ log->get_log_number());
+ s = log->file()->Sync(immutable_db_options_.use_fsync);
+ if (!s.ok()) {
+ break;
+ }
+
+ if (immutable_db_options_.recycle_log_file_num > 0) {
+ s = log->Close();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (s.ok()) {
+ s = directories_.GetWalDir()->Fsync();
+ }
+
+ mutex_.Lock();
+
+ // "number <= current_log_number - 1" is equivalent to
+ // "number < current_log_number".
+ MarkLogsSynced(current_log_number - 1, true, s);
+ if (!s.ok()) {
+ error_handler_.SetBGError(s, BackgroundErrorReason::kFlush);
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+ return s;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* made_progress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+
+ FlushJob flush_job(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options,
+ nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(),
+ &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+ GetDataDir(cfd, 0U),
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+ &event_logger_, mutable_cf_options.report_bg_io_stats,
+ true /* sync_output_directory */, true /* write_manifest */, thread_pri);
+
+ FileMetaData file_meta;
+
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+ flush_job.PickMemTable();
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables");
+
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif // ROCKSDB_LITE
+
+ Status s;
+ if (logfile_number_ > 0 &&
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) {
+ // If there is more than one column family, we need to make sure that
+ // all the log files except the most recent one are synced. Otherwise if
+ // the host crashes after flushing and before WAL is persistent, the
+ // flushed SST may contain data from write batches whose updates to
+ // other column families are missing.
+ // SyncClosedLogs() may unlock and re-lock the db_mutex.
+ s = SyncClosedLogs(job_context);
+ } else {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+ }
+
+ // Within flush_job.Run, rocksdb may call event listener to notify
+ // file creation and deletion.
+ //
+ // Note that flush_job.Run will unlock and lock the db_mutex,
+ // and EventListener callback will be called when the db_mutex
+ // is unlocked by the current thread.
+ if (s.ok()) {
+ s = flush_job.Run(&logs_with_prep_tracker_, &file_meta);
+ } else {
+ flush_job.Cancel();
+ }
+
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+ mutable_cf_options);
+ if (made_progress) {
+ *made_progress = true;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ cfd->GetName().c_str(),
+ cfd->current()->storage_info()->LevelSummary(&tmp));
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ if (s.ok()) {
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushCompleted(cfd, mutable_cf_options,
+ flush_job.GetCommittedFlushJobsInfo());
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Notify sst_file_manager that a new file was added
+ std::string file_path = MakeTableFileName(
+ cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+ sfm->OnAddFile(file_path);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ &new_bg_error);
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+ return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ if (immutable_db_options_.atomic_flush) {
+ return AtomicFlushMemTablesToOutputFiles(
+ bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+ }
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ Status status;
+ for (auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ SuperVersionContext* superversion_context = arg.superversion_context_;
+ Status s = FlushMemTableToOutputFile(
+ cfd, mutable_cf_options, made_progress, job_context,
+ superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, log_buffer, thread_pri);
+ if (!s.ok()) {
+ status = s;
+ if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+ // At this point, DB is not shutting down, nor is cfd dropped.
+ // Something is wrong, thus we break out of the loop.
+ break;
+ }
+ }
+ }
+ return status;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ autovector<ColumnFamilyData*> cfds;
+ for (const auto& arg : bg_flush_args) {
+ cfds.emplace_back(arg.cfd_);
+ }
+
+#ifndef NDEBUG
+ for (const auto cfd : cfds) {
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+ }
+#endif /* !NDEBUG */
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ autovector<Directory*> distinct_output_dirs;
+ autovector<std::string> distinct_output_dir_paths;
+ std::vector<std::unique_ptr<FlushJob>> jobs;
+ std::vector<MutableCFOptions> all_mutable_cf_options;
+ int num_cfs = static_cast<int>(cfds.size());
+ all_mutable_cf_options.reserve(num_cfs);
+ for (int i = 0; i < num_cfs; ++i) {
+ auto cfd = cfds[i];
+ Directory* data_dir = GetDataDir(cfd, 0U);
+ const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+ // Add to distinct output directories if eligible. Use linear search. Since
+ // the number of elements in the vector is not large, performance should be
+ // tolerable.
+ bool found = false;
+ for (const auto& path : distinct_output_dir_paths) {
+ if (path == curr_path) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ distinct_output_dir_paths.emplace_back(curr_path);
+ distinct_output_dirs.emplace_back(data_dir);
+ }
+
+ all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+ const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_);
+ jobs.emplace_back(new FlushJob(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options,
+ max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+ data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+ false /* sync_output_directory */, false /* write_manifest */,
+ thread_pri));
+ jobs.back()->PickMemTable();
+ }
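+ // Note that each FlushJob above is created with sync_output_directory ==
+ // false and write_manifest == false: the output directories are synced once
+ // for all jobs further below, and the MANIFEST is committed for all column
+ // families together via InstallMemtableAtomicFlushResults().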
+
+ std::vector<FileMetaData> file_meta(num_cfs);
+ Status s;
+ assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+ for (int i = 0; i != num_cfs; ++i) {
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+ job_context->job_id);
+ }
+#endif /* !ROCKSDB_LITE */
+
+ if (logfile_number_ > 0) {
+ // TODO (yanqin) investigate whether we should sync the closed logs for
+ // single column family case.
+ s = SyncClosedLogs(job_context);
+ }
+
+ // exec_status stores the execution status of flush_jobs as
+ // <bool /* executed */, Status /* status code */>
+ autovector<std::pair<bool, Status>> exec_status;
+ for (int i = 0; i != num_cfs; ++i) {
+ // Initially all jobs are not executed, with status OK.
+ exec_status.emplace_back(false, Status::OK());
+ }
+
+ if (s.ok()) {
+ // TODO (yanqin): parallelize jobs with threads.
+ for (int i = 1; i != num_cfs; ++i) {
+ exec_status[i].second =
+ jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]);
+ exec_status[i].first = true;
+ }
+ if (num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1");
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
+ }
+ assert(exec_status.size() > 0);
+ assert(!file_meta.empty());
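+ // With the serial implementation above (see the TODO), all jobs run on this
+ // thread; job 0 is run after the sync points so that tests can observe a
+ // state in which some, but not all, flush jobs of the atomic flush have
+ // completed.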
+ exec_status[0].second =
+ jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]);
+ exec_status[0].first = true;
+
+ Status error_status;
+ for (const auto& e : exec_status) {
+ if (!e.second.ok()) {
+ s = e.second;
+ if (!e.second.IsShutdownInProgress() &&
+ !e.second.IsColumnFamilyDropped()) {
+ // If a flush job did not return OK, and the CF is not dropped, and
+ // the DB is not shutting down, then we have to return this result to
+ // the caller later.
+ error_status = e.second;
+ }
+ }
+ }
+
+ s = error_status.ok() ? s : error_status;
+ }
+
+ if (s.IsColumnFamilyDropped()) {
+ s = Status::OK();
+ }
+
+ if (s.ok() || s.IsShutdownInProgress()) {
+ // Sync on all distinct output directories.
+ for (auto dir : distinct_output_dirs) {
+ if (dir != nullptr) {
+ Status error_status = dir->Fsync();
+ if (!error_status.ok()) {
+ s = error_status;
+ break;
+ }
+ }
+ }
+ } else {
+ // Need to undo the atomic flush if something went wrong, i.e. s is not OK
+ // and it is not because of a CF drop.
+ // The flush jobs that have NOT executed must be cancelled so that the
+ // versions they picked are unreferenced.
+ for (int i = 0; i != num_cfs; ++i) {
+ if (!exec_status[i].first) {
+ jobs[i]->Cancel();
+ }
+ }
+ for (int i = 0; i != num_cfs; ++i) {
+ if (exec_status[i].first && exec_status[i].second.ok()) {
+ auto& mems = jobs[i]->GetMemTables();
+ cfds[i]->imm()->RollbackMemtableFlush(mems,
+ file_meta[i].fd.GetNumber());
+ }
+ }
+ }
+
+ if (s.ok()) {
+ auto wait_to_install_func = [&]() {
+ bool ready = true;
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (cfds[i]->IsDropped()) {
+ // If the column family is dropped, then do not wait.
+ continue;
+ } else if (!mems.empty() &&
+ cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
+ // If a flush job needs to install the flush result for mems and
+ // mems[0] is not the earliest memtable, then another thread must be
+ // installing flush results for the same column family, so the
+ // current thread needs to wait.
+ ready = false;
+ break;
+ } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
+ bg_flush_args[i].max_memtable_id_) {
+ // If a flush job does not need to install flush results, then it has
+ // to wait until all memtables up to max_memtable_id_ (inclusive) are
+ // installed.
+ ready = false;
+ break;
+ }
+ }
+ return ready;
+ };
+
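+ // If the DB was already stopped by a background error at this point, this
+ // atomic flush is part of error recovery: the wait below then continues only
+ // while the recovery error is OK, and the recovery error (rather than the
+ // already-set background error) is reported to the caller.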
+ bool resuming_from_bg_err = error_handler_.IsDBStopped();
+ while ((!error_handler_.IsDBStopped() ||
+ error_handler_.GetRecoveryError().ok()) &&
+ !wait_to_install_func()) {
+ atomic_flush_install_cv_.Wait();
+ }
+
+ s = resuming_from_bg_err ? error_handler_.GetRecoveryError()
+ : error_handler_.GetBGError();
+ }
+
+ if (s.ok()) {
+ autovector<ColumnFamilyData*> tmp_cfds;
+ autovector<const autovector<MemTable*>*> mems_list;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<FileMetaData*> tmp_file_meta;
+ for (int i = 0; i != num_cfs; ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (!cfds[i]->IsDropped() && !mems.empty()) {
+ tmp_cfds.emplace_back(cfds[i]);
+ mems_list.emplace_back(&mems);
+ mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+ tmp_file_meta.emplace_back(&file_meta[i]);
+ }
+ }
+
+ s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), &mutex_, tmp_file_meta,
+ &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer);
+ }
+
+ if (s.ok()) {
+ assert(num_cfs ==
+ static_cast<int>(job_context->superversion_contexts.size()));
+ for (int i = 0; i != num_cfs; ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ InstallSuperVersionAndScheduleWork(cfds[i],
+ &job_context->superversion_contexts[i],
+ all_mutable_cf_options[i]);
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ cfds[i]->GetName().c_str(),
+ cfds[i]->current()->storage_info()->LevelSummary(&tmp));
+ }
+ if (made_progress) {
+ *made_progress = true;
+ }
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ assert(all_mutable_cf_options.size() == static_cast<size_t>(num_cfs));
+ for (int i = 0; i != num_cfs; ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i],
+ jobs[i]->GetCommittedFlushJobsInfo());
+ if (sfm) {
+ std::string file_path = MakeTableFileName(
+ cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+ sfm->OnAddFile(file_path);
+ if (sfm->IsMaxAllowedSpaceReached() &&
+ error_handler_.GetBGError().ok()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ error_handler_.SetBGError(new_bg_error,
+ BackgroundErrorReason::kFlush);
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress()) {
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+
+ return s;
+}
+
+void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ FlushJobInfo info{};
+ info.cf_id = cfd->GetID();
+ info.cf_name = cfd->GetName();
+ // TODO(yhchiang): make db_paths dynamic in case flush does not
+ // go to L0 in the future.
+ const uint64_t file_number = file_meta->fd.GetNumber();
+ info.file_path =
+ MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number);
+ info.file_number = file_number;
+ info.thread_id = env_->GetThreadID();
+ info.job_id = job_id;
+ info.triggered_writes_slowdown = triggered_writes_slowdown;
+ info.triggered_writes_stop = triggered_writes_stop;
+ info.smallest_seqno = file_meta->fd.smallest_seqno;
+ info.largest_seqno = file_meta->fd.largest_seqno;
+ info.flush_reason = cfd->GetFlushReason();
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushBegin(this, info);
+ }
+ }
+ mutex_.Lock();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)file_meta;
+ (void)mutable_cf_options;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info) {
+#ifndef ROCKSDB_LITE
+ assert(flush_jobs_info != nullptr);
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ for (auto& info : *flush_jobs_info) {
+ info->triggered_writes_slowdown = triggered_writes_slowdown;
+ info->triggered_writes_stop = triggered_writes_stop;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushCompleted(this, *info);
+ }
+ }
+ flush_jobs_info->clear();
+ }
+ mutex_.Lock();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)mutable_cf_options;
+ (void)flush_jobs_info;
+#endif // ROCKSDB_LITE
+}
+
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+ return Status::InvalidArgument("Invalid target path ID");
+ }
+
+ bool exclusive = options.exclusive_manual_compaction;
+
+ bool flush_needed = true;
+ if (begin != nullptr && end != nullptr) {
+ // TODO(ajkr): We could also optimize away the flush in certain cases where
+ // one/both sides of the interval are unbounded. But it requires more
+ // changes to RangesOverlapWithMemtables.
+ Range range(*begin, *end);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed);
+ CleanupSuperVersion(super_version);
+ }
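+ // Note that flush_needed remains true when either end of the range is
+ // unbounded, so such a CompactRange always flushes the memtable first.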
+
+ Status s;
+ if (flush_needed) {
+ FlushOptions fo;
+ fo.allow_write_stall = options.allow_write_stall;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ mutex_.Lock();
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+ false /* writes_stopped */);
+ } else {
+ s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction,
+ false /* writes_stopped*/);
+ }
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+ }
+
+ int max_level_with_files = 0;
+ // max_file_num_to_ignore can be used to filter out newly created SST files,
+ // useful for bottom level compaction in a manual compaction
+ uint64_t max_file_num_to_ignore = port::kMaxUint64;
+ uint64_t next_file_number = port::kMaxUint64;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ Version* base = cfd->current();
+ for (int level = 1; level < base->storage_info()->num_non_empty_levels();
+ level++) {
+ if (base->storage_info()->OverlapInLevel(level, begin, end)) {
+ max_level_with_files = level;
+ }
+ }
+ next_file_number = versions_->current_next_file_number();
+ }
+
+ int final_output_level = 0;
+
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+ cfd->NumberLevels() > 1) {
+ // Always compact all files together.
+ final_output_level = cfd->NumberLevels() - 1;
+ // if bottom most level is reserved
+ if (immutable_db_options_.allow_ingest_behind) {
+ final_output_level--;
+ }
+ s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+ final_output_level, options, begin, end, exclusive,
+ false, max_file_num_to_ignore);
+ } else {
+ for (int level = 0; level <= max_level_with_files; level++) {
+ int output_level;
+ // In case the compaction is universal or if we're compacting the
+ // bottom-most level, the output level will be the same as the input one.
+ // Level 0 can never be the bottommost level (i.e. if all files are in
+ // level 0, we will compact to level 1).
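+ // Example (leveled compaction without dynamic level bytes): with
+ // max_level_with_files == 3, this loop issues L0->L1, L1->L2 and L2->L3
+ // compactions, followed by an L3->L3 pass for the bottommost level unless
+ // that pass is skipped by the bottommost_level_compaction checks below.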
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ output_level = level;
+ } else if (level == max_level_with_files && level > 0) {
+ if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kSkip) {
+ // Skip bottommost level compaction
+ continue;
+ } else if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kIfHaveCompactionFilter &&
+ cfd->ioptions()->compaction_filter == nullptr &&
+ cfd->ioptions()->compaction_filter_factory == nullptr) {
+ // Skip bottommost level compaction since we don't have a compaction
+ // filter
+ continue;
+ }
+ output_level = level;
+ // update max_file_num_to_ignore only for bottom level compaction
+ // because data in newly compacted files in middle levels may still need
+ // to be pushed down
+ max_file_num_to_ignore = next_file_number;
+ } else {
+ output_level = level + 1;
+ if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+ cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+ level == 0) {
+ output_level = ColumnFamilyData::kCompactToBaseLevel;
+ }
+ }
+ s = RunManualCompaction(cfd, level, output_level, options, begin, end,
+ exclusive, false, max_file_num_to_ignore);
+ if (!s.ok()) {
+ break;
+ }
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ final_output_level = cfd->NumberLevels() - 1;
+ } else if (output_level > final_output_level) {
+ final_output_level = output_level;
+ }
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+ }
+ }
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+
+ if (options.change_level) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[RefitLevel] waiting for background threads to stop");
+ s = PauseBackgroundWork();
+ if (s.ok()) {
+ s = ReFitLevel(cfd, final_output_level, options.target_level);
+ }
+ ContinueBackgroundWork();
+ }
+ LogFlush(immutable_db_options_.info_log);
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // an automatic compaction that has been scheduled might have been
+ // preempted by the manual compactions. Need to schedule it back.
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names,
+ const int output_level, const int output_path_id,
+ std::vector<std::string>* const output_file_names,
+ CompactionJobInfo* compaction_job_info) {
+#ifdef ROCKSDB_LITE
+ (void)compact_options;
+ (void)column_family;
+ (void)input_file_names;
+ (void)output_level;
+ (void)output_path_id;
+ (void)output_file_names;
+ (void)compaction_job_info;
+ // not supported in lite version
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (column_family == nullptr) {
+ return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+ }
+
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ assert(cfd);
+
+ Status s;
+ JobContext job_context(0, true);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ // Perform CompactFiles
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ // We need to get current after `WaitForIngestFile`, because
+ // `IngestExternalFile` may add files that overlap with `input_file_names`
+ auto* current = cfd->current();
+ current->Ref();
+
+ s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+ output_file_names, output_level, output_path_id,
+ &job_context, &log_buffer, compaction_job_info);
+
+ current->Unref();
+ }
+
+ // Find and delete obsolete files
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // If !s.ok(), this means that Compaction failed. In that case, we want
+ // to delete all obsolete files we might have created and we force
+ // FindObsoleteFiles(). This is because job_context does not
+ // catch all created files if compaction failed.
+ FindObsoleteFiles(&job_context, !s.ok());
+ } // release the mutex
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ // Have to flush the info logs before bg_compaction_scheduled_--
+ // because if bg_compaction_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of DB, so info_log might not be available after that point.
+ // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ // no mutex is locked here. No need to Unlock() and Lock() here.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ }
+
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::CompactFilesImpl(
+ const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+ Version* version, const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names, const int output_level,
+ int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info) {
+ mutex_.AssertHeld();
+
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+ if (manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ std::unordered_set<uint64_t> input_set;
+ for (const auto& file_name : input_file_names) {
+ input_set.insert(TableFileNameToNumber(file_name));
+ }
+
+ ColumnFamilyMetaData cf_meta;
+ // TODO(yhchiang): can directly use version here if none of the
+ // following functions call is pluggable to external developers.
+ version->GetColumnFamilyMetaData(&cf_meta);
+
+ if (output_path_id < 0) {
+ if (cfd->ioptions()->cf_paths.size() == 1U) {
+ output_path_id = 0;
+ } else {
+ return Status::NotSupported(
+ "Automatic output path selection is not "
+ "yet supported in CompactFiles()");
+ }
+ }
+
+ Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+ &input_set, cf_meta, output_level);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<CompactionInputFiles> input_files;
+ s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, version->storage_info(), compact_options);
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (const auto& inputs : input_files) {
+ if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+ return Status::Aborted(
+ "Some of the necessary compaction input "
+ "files are already being compacted");
+ }
+ }
+ bool sfm_reserved_compact_space = false;
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ return Status::CompactionTooLarge();
+ }
+
+ // At this point, CompactFiles will be run.
+ bg_compaction_scheduled_++;
+
+ std::unique_ptr<Compaction> c;
+ assert(cfd->compaction_picker());
+ c.reset(cfd->compaction_picker()->CompactFiles(
+ compact_options, input_files, output_level, version->storage_info(),
+ *cfd->GetLatestMutableCFOptions(), output_path_id));
+ // we already sanitized the set of input files and checked for conflicts
+ // without releasing the lock, so we're guaranteed a compaction can be formed.
+ assert(c != nullptr);
+
+ c->SetInputVersion(version);
+ // deletion compaction currently not allowed in CompactFiles.
+ assert(!c->deletion_compaction());
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
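+ // Capturing the current file number here keeps FindObsoleteFiles() from
+ // treating the files this compaction is about to create as obsolete while
+ // the job is still running; it is released further below.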
+
+ assert(is_snapshot_supported_ || snapshots_.empty());
+ CompactionJobStats compaction_job_stats;
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_,
+ &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, table_cache_, &event_logger_,
+ c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here.
+ version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+ *c->mutable_cf_options());
+
+ compaction_job.Prepare();
+
+ mutex_.Unlock();
+ TEST_SYNC_POINT("CompactFilesImpl:0");
+ TEST_SYNC_POINT("CompactFilesImpl:1");
+ compaction_job.Run();
+ TEST_SYNC_POINT("CompactFilesImpl:2");
+ TEST_SYNC_POINT("CompactFilesImpl:3");
+ mutex_.Lock();
+
+ Status status = compaction_job.Install(*c->mutable_cf_options());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+ c->ReleaseCompactionFiles(s);
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ if (compaction_job_info != nullptr) {
+ BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+ job_context->job_id, version, compaction_job_info);
+ }
+
+ if (status.ok()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else if (status.IsManualCompactionPaused()) {
+ // Don't report stopping manual compaction as error
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Stopping manual compaction",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id);
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Compaction error: %s",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id, status.ToString().c_str());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ }
+
+ if (output_file_names != nullptr) {
+ for (const auto newf : c->edit()->GetNewFiles()) {
+ (*output_file_names)
+ .push_back(TableFileName(c->immutable_cf_options()->cf_paths,
+ newf.second.fd.GetNumber(),
+ newf.second.fd.GetPathId()));
+ }
+ }
+
+ c.reset();
+
+ bg_compaction_scheduled_--;
+ if (bg_compaction_scheduled_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ MaybeScheduleFlushOrCompaction();
+ TEST_SYNC_POINT("CompactFilesImpl:End");
+
+ return status;
+}
+#endif // ROCKSDB_LITE
+
+Status DBImpl::PauseBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ bg_compaction_paused_++;
+ while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+ bg_flush_scheduled_ > 0) {
+ bg_cv_.Wait();
+ }
+ bg_work_paused_++;
+ return Status::OK();
+}
+
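+ // PauseBackgroundWork() and ContinueBackgroundWork() are reference counted:
+ // every pause must be matched by a continue, and background work is only
+ // rescheduled once the pause count drops back to zero.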
+Status DBImpl::ContinueBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ if (bg_work_paused_ == 0) {
+ return Status::InvalidArgument();
+ }
+ assert(bg_work_paused_ > 0);
+ assert(bg_compaction_paused_ > 0);
+ bg_compaction_paused_--;
+ bg_work_paused_--;
+ // It's sufficient to check just bg_work_paused_ here since
+ // bg_work_paused_ is always no greater than bg_compaction_paused_
+ if (bg_work_paused_ == 0) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return;
+ }
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ info.cf_name = cfd->GetName();
+ info.status = st;
+ info.thread_id = env_->GetThreadID();
+ info.job_id = job_id;
+ info.base_input_level = c->start_level();
+ info.output_level = c->output_level();
+ info.stats = job_stats;
+ info.table_properties = c->GetOutputTableProperties();
+ info.compaction_reason = c->compaction_reason();
+ info.compression = c->output_compression();
+ for (size_t i = 0; i < c->num_input_levels(); ++i) {
+ for (const auto fmd : *c->inputs(i)) {
+ const FileDescriptor& desc = fmd->fd;
+ const uint64_t file_number = desc.GetNumber();
+ auto fn = TableFileName(c->immutable_cf_options()->cf_paths,
+ file_number, desc.GetPathId());
+ info.input_files.push_back(fn);
+ info.input_file_infos.push_back(CompactionFileInfo{
+ static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+ if (info.table_properties.count(fn) == 0) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = current->GetTableProperties(&tp, fmd, &fn);
+ if (s.ok()) {
+ info.table_properties[fn] = tp;
+ }
+ }
+ }
+ }
+ for (const auto newf : c->edit()->GetNewFiles()) {
+ const FileMetaData& meta = newf.second;
+ const FileDescriptor& desc = meta.fd;
+ const uint64_t file_number = desc.GetNumber();
+ info.output_files.push_back(TableFileName(
+ c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId()));
+ info.output_file_infos.push_back(CompactionFileInfo{
+ newf.first, file_number, meta.oldest_blob_file_number});
+ }
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionBegin(this, info);
+ }
+ }
+ mutex_.Lock();
+ current->Unref();
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnCompactionCompleted(
+ ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return;
+ }
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
+ &info);
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionCompleted(this, info);
+ }
+ }
+ mutex_.Lock();
+ current->Unref();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // compaction process.
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)compaction_job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+ assert(level < cfd->NumberLevels());
+ if (target_level >= cfd->NumberLevels()) {
+ return Status::InvalidArgument("Target level exceeds number of levels");
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+
+ Status status;
+
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ // only allow one thread refitting
+ if (refitting_level_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[ReFitLevel] another thread is refitting");
+ return Status::NotSupported("another thread is refitting");
+ }
+ refitting_level_ = true;
+
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ // move to a smaller level
+ int to_level = target_level;
+ if (target_level < 0) {
+ to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
+ }
+
+ auto* vstorage = cfd->current()->storage_info();
+ if (to_level > level) {
+ if (level == 0) {
+ return Status::NotSupported(
+ "Cannot change from level 0 to other levels.");
+ }
+ // Check levels are empty for a trivial move
+ for (int l = level + 1; l <= to_level; l++) {
+ if (vstorage->NumLevelFiles(l) > 0) {
+ return Status::NotSupported(
+ "Levels between source and target are not empty for a move.");
+ }
+ }
+ }
+ if (to_level != level) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : vstorage->LevelFiles(level)) {
+ edit.DeleteFile(level, f->fd.GetNumber());
+ edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time,
+ f->file_checksum, f->file_checksum_func_name);
+ }
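+ // This is a metadata-only move: the same physical files (same file numbers)
+ // are deleted from 'level' and re-added at 'to_level' in a single
+ // VersionEdit, so no file contents are rewritten.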
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+ edit.DebugString().data());
+
+ status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
+ directories_.GetDbDir());
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options);
+
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n",
+ cfd->GetName().c_str(), status.ToString().data());
+
+ if (status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+ }
+ }
+
+ sv_context.Clean();
+ refitting_level_ = false;
+
+ return status;
+}
+
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return cfh->cfd()->NumberLevels();
+}
+
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+ return 0;
+}
+
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ InstrumentedMutexLock l(&mutex_);
+ return cfh->cfd()
+ ->GetSuperVersion()
+ ->mutable_cf_options.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.",
+ cfh->GetName().c_str());
+ Status s;
+ if (immutable_db_options_.atomic_flush) {
+ s = AtomicFlushMemTables({cfh->cfd()}, flush_options,
+ FlushReason::kManualFlush);
+ } else {
+ s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] Manual flush finished, status: %s\n",
+ cfh->GetName().c_str(), s.ToString().c_str());
+ return s;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ if (!immutable_db_options_.atomic_flush) {
+ for (auto cfh : column_families) {
+ s = Flush(flush_options, cfh);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush start.\n"
+ "=====Column families:=====");
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ autovector<ColumnFamilyData*> cfds;
+ std::for_each(column_families.begin(), column_families.end(),
+ [&cfds](ColumnFamilyHandle* elem) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
+ cfds.emplace_back(cfh->cfd());
+ });
+ s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush finished, status: %s\n"
+ "=====Column families:=====",
+ s.ToString().c_str());
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ }
+ return s;
+}
+
+Status DBImpl::RunManualCompaction(
+ ColumnFamilyData* cfd, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const Slice* begin,
+ const Slice* end, bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore) {
+ assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+ input_level >= 0);
+
+ InternalKey begin_storage, end_storage;
+ CompactionArg* ca;
+
+ bool scheduled = false;
+ bool manual_conflict = false;
+ ManualCompactionState manual;
+ manual.cfd = cfd;
+ manual.input_level = input_level;
+ manual.output_level = output_level;
+ manual.output_path_id = compact_range_options.target_path_id;
+ manual.done = false;
+ manual.in_progress = false;
+ manual.incomplete = false;
+ manual.exclusive = exclusive;
+ manual.disallow_trivial_move = disallow_trivial_move;
+ // For universal compaction, we enforce every manual compaction to compact
+ // all files.
+ if (begin == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.begin = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ manual.begin = &begin_storage;
+ }
+ if (end == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.end = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ manual.end = &end_storage;
+ }
+
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:1");
+ InstrumentedMutexLock l(&mutex_);
+
+ // When a manual compaction arrives, temporarily disable scheduling of
+ // non-manual compactions and wait until the number of scheduled compaction
+ // jobs drops to zero. This is needed to ensure that this manual compaction
+ // can compact any range of keys/files.
+ //
+ // HasPendingManualCompaction() is true when at least one thread is inside
+ // RunManualCompaction(), i.e. during that time no other compaction will
+ // get scheduled (see MaybeScheduleFlushOrCompaction).
+ //
+ // Note that the following loop doesn't stop more than one thread calling
+ // RunManualCompaction() from getting to the second while loop below.
+ // However, only one of them will actually schedule compaction, while
+ // others will wait on a condition variable until it completes.
+
+ AddManualCompaction(&manual);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
+ if (exclusive) {
+ while (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0) {
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled");
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[%s] Manual compaction waiting for all other scheduled background "
+ "compactions to finish",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] Manual compaction starting", cfd->GetName().c_str());
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ // We don't check bg_error_ here, because if we get the error in compaction,
+ // the compaction will set manual.status to bg_error_ and set manual.done to
+ // true.
+ while (!manual.done) {
+ assert(HasPendingManualCompaction());
+ manual_conflict = false;
+ Compaction* compaction = nullptr;
+ if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
+ scheduled ||
+ (((manual.manual_end = &manual.tmp_storage1) != nullptr) &&
+ ((compaction = manual.cfd->CompactRange(
+ *manual.cfd->GetLatestMutableCFOptions(), manual.input_level,
+ manual.output_level, compact_range_options, manual.begin,
+ manual.end, &manual.manual_end, &manual_conflict,
+ max_file_num_to_ignore)) == nullptr &&
+ manual_conflict))) {
+ // exclusive manual compactions should not see a conflict during
+ // CompactRange
+ assert(!exclusive || !manual_conflict);
+ // Running either this or some other manual compaction
+ bg_cv_.Wait();
+ if (scheduled && manual.incomplete == true) {
+ assert(!manual.in_progress);
+ scheduled = false;
+ manual.incomplete = false;
+ }
+ } else if (!scheduled) {
+ if (compaction == nullptr) {
+ manual.done = true;
+ bg_cv_.SignalAll();
+ continue;
+ }
+ ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->manual_compaction_state = &manual;
+ ca->prepicked_compaction->compaction = compaction;
+ if (!RequestCompactionToken(
+ cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
+ // Don't throttle manual compaction, only count outstanding tasks.
+ assert(false);
+ }
+ manual.incomplete = false;
+ bg_compaction_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleCompactionCallback);
+ scheduled = true;
+ }
+ }
+
+ log_buffer.FlushBufferToLog();
+ assert(!manual.in_progress);
+ assert(HasPendingManualCompaction());
+ RemoveManualCompaction(&manual);
+ bg_cv_.SignalAll();
+ return manual.status;
+}
+
+void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req) {
+ assert(req != nullptr);
+ req->reserve(cfds.size());
+ for (const auto cfd : cfds) {
+ if (nullptr == cfd) {
+ // cfd may be null, see DBImpl::ScheduleFlushes
+ continue;
+ }
+ uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
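+ // Record the newest immutable memtable ID at this point; the scheduled
+ // flush is expected to cover all immutable memtables with IDs up to and
+ // including this one (see AtomicFlushMemTablesToOutputFiles above).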
+ req->emplace_back(cfd, max_memtable_id);
+ }
+}
+
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_options,
+ FlushReason flush_reason, bool writes_stopped) {
+ Status s;
+ uint64_t flush_memtable_id = 0;
+ if (!flush_options.allow_write_stall) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone");
+ if (!s.ok() || !flush_needed) {
+ return s;
+ }
+ }
+ FlushRequest flush_req;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (!writes_stopped) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
+ s = SwitchMemtable(cfd, &context);
+ }
+ if (s.ok()) {
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ flush_memtable_id = cfd->imm()->GetLatestMemTableID();
+ flush_req.emplace_back(cfd, flush_memtable_id);
+ }
+ if (immutable_db_options_.persist_stats_to_disk) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && cfd_stats != cfd &&
+ !cfd_stats->mem()->IsEmpty()) {
+ // only force flush stats CF when it will be the only CF lagging
+ // behind after the current flush
+ bool stats_cf_flush_needed = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats || loop_cfd == cfd) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ stats_cf_flush_needed = false;
+ }
+ }
+ if (stats_cf_flush_needed) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with manual flush of %s "
+ "to avoid holding old logs",
+ cfd->GetName().c_str());
+ s = SwitchMemtable(cfd_stats, &context);
+ flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID();
+ flush_req.emplace_back(cfd_stats, flush_memtable_id);
+ }
+ }
+ }
+ }
+
+ if (s.ok() && !flush_req.empty()) {
+ for (auto& elem : flush_req) {
+ ColumnFamilyData* loop_cfd = elem.first;
+ loop_cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+ // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (auto& elem : flush_req) {
+ ColumnFamilyData* loop_cfd = elem.first;
+ loop_cfd->Ref();
+ }
+ }
+ SchedulePendingFlush(flush_req, flush_reason);
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (!writes_stopped) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const uint64_t*> flush_memtable_ids;
+ for (auto& iter : flush_req) {
+ cfds.push_back(iter.first);
+ flush_memtable_ids.push_back(&(iter.second));
+ }
+ s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* tmp_cfd : cfds) {
+ tmp_cfd->UnrefAndTryDelete();
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
+ return s;
+}
+
+// Flush all elements in 'column_family_datas'
+// and atomically record the result to the MANIFEST.
+Status DBImpl::AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& flush_options, FlushReason flush_reason,
+ bool writes_stopped) {
+ Status s;
+ if (!flush_options.allow_write_stall) {
+ int num_cfs_to_flush = 0;
+ for (auto cfd : column_family_datas) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ if (!s.ok()) {
+ return s;
+ } else if (flush_needed) {
+ ++num_cfs_to_flush;
+ }
+ }
+ if (0 == num_cfs_to_flush) {
+ return s;
+ }
+ }
+ FlushRequest flush_req;
+ autovector<ColumnFamilyData*> cfds;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (!writes_stopped) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ for (auto cfd : column_family_datas) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds.emplace_back(cfd);
+ }
+ }
+ for (auto cfd : cfds) {
+ if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+ continue;
+ }
+ cfd->Ref();
+ s = SwitchMemtable(cfd, &context);
+ cfd->UnrefAndTryDelete();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ AssignAtomicFlushSeq(cfds);
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+ // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ }
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, flush_reason);
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (!writes_stopped) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<const uint64_t*> flush_memtable_ids;
+ for (auto& iter : flush_req) {
+ flush_memtable_ids.push_back(&(iter.second));
+ }
+ s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* cfd : cfds) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause a write stall, for example if one memtable is being flushed already.
+// This method tries to avoid a write stall (similar to CompactRange()
+// behavior): it emulates how the SuperVersion / LSM would change if the flush
+// happened, checks it against various constraints and delays the flush if it
+// would cause a write stall.
+// The caller should check status and flush_needed to see if the flush already
+// happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed) {
+ {
+ *flush_needed = true;
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+ WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+ do {
+ if (write_stall_condition != WriteStallCondition::kNormal) {
+ // Same error handling as user writes: Don't wait if there's a
+ // background error, even if it's a soft error. We might wait here
+ // indefinitely as the pending flushes/compactions may never finish
+ // successfully, resulting in the stall condition lasting indefinitely
+ if (error_handler_.IsBGWorkStopped()) {
+ return error_handler_.GetBGError();
+ }
+
+ TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] WaitUntilFlushWouldNotStallWrites"
+ " waiting on stall conditions to clear",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ if (cfd->IsDropped()) {
+ return Status::ColumnFamilyDropped();
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+
+ uint64_t earliest_memtable_id =
+ std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID());
+ if (earliest_memtable_id > orig_active_memtable_id) {
+ // We waited so long that the memtable we were originally waiting on was
+ // flushed.
+ *flush_needed = false;
+ return Status::OK();
+ }
+
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ // Skip stalling check if we're below auto-flush and auto-compaction
+ // triggers. If it stalled in these conditions, that'd mean the stall
+ // triggers are so low that stalling is needed for any background work. In
+ // that case we shouldn't wait since background work won't be scheduled.
+ if (cfd->imm()->NumNotFlushed() <
+ cfd->ioptions()->min_write_buffer_number_to_merge &&
+ vstorage->l0_delay_trigger_count() <
+ mutable_cf_options.level0_file_num_compaction_trigger) {
+ break;
+ }
+
+ // check whether one extra immutable memtable or an extra L0 file would
+ // cause write stalling mode to be entered. It could still enter stall
+ // mode due to pending compaction bytes, but that's less common
+ write_stall_condition =
+ ColumnFamilyData::GetWriteStallConditionAndCause(
+ cfd->imm()->NumNotFlushed() + 1,
+ vstorage->l0_delay_trigger_count() + 1,
+ vstorage->estimated_compaction_needed_bytes(), mutable_cf_options)
+ .first;
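+ // Illustrative example with hypothetical numbers: if
+ // level0_slowdown_writes_trigger is 20 and the current L0 delay trigger
+ // count is 19, the hypothetical extra L0 file reaches the slowdown
+ // threshold, the condition is not kNormal, and this loop keeps waiting.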
+ } while (write_stall_condition != WriteStallCondition::kNormal);
+ }
+ return Status::OK();
+}
+
+// Wait for memtables to be flushed for multiple column families.
+// let N = cfds.size()
+// for i in [0, N),
+// 1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs
+// have to be flushed for THIS column family;
+// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column
+// family have to be flushed.
+// Finish waiting when ALL column families finish flushing memtables.
+// resuming_from_bg_err indicates whether the caller is trying to resume from
+// background error or in normal processing.
+Status DBImpl::WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err) {
+ int num = static_cast<int>(cfds.size());
+ // Wait until the flushes complete
+ InstrumentedMutexLock l(&mutex_);
+ // If the caller is trying to resume from bg error, then
+ // error_handler_.IsDBStopped() is true.
+ while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+ // If an error has occurred during resumption, then no need to wait.
+ if (!error_handler_.GetRecoveryError().ok()) {
+ break;
+ }
+ // Number of column families that have been dropped.
+ int num_dropped = 0;
+ // Number of column families that have finished flush.
+ int num_finished = 0;
+ for (int i = 0; i < num; ++i) {
+ if (cfds[i]->IsDropped()) {
+ ++num_dropped;
+ } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+ (flush_memtable_ids[i] != nullptr &&
+ cfds[i]->imm()->GetEarliestMemTableID() >
+ *flush_memtable_ids[i])) {
+ ++num_finished;
+ }
+ }
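+ // Example: if flush_memtable_ids[i] points to 7 and the earliest
+ // not-yet-flushed immutable memtable of cfds[i] now has ID 8, then every
+ // memtable with ID <= 7 has been flushed, so this column family counts as
+ // finished.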
+ if (1 == num_dropped && 1 == num) {
+ return Status::InvalidArgument("Cannot flush a dropped CF");
+ }
+ // Column families involved in this flush request have either been dropped
+ // or finished flush. Then it's time to finish waiting.
+ if (num_dropped + num_finished == num) {
+ break;
+ }
+ bg_cv_.Wait();
+ }
+ Status s;
+ // If not resuming from bg error, and an error has caused the DB to stop,
+ // then report the bg error to caller.
+ if (!resuming_from_bg_err && error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+Status DBImpl::EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) {
+ Status s;
+ for (auto cf_ptr : column_family_handles) {
+ Status status =
+ this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
+ if (!status.ok()) {
+ s = status;
+ }
+ }
+
+ return s;
+}
+
+void DBImpl::DisableManualCompaction() {
+ manual_compaction_paused_.store(true, std::memory_order_release);
+}
+
+void DBImpl::EnableManualCompaction() {
+ manual_compaction_paused_.store(false, std::memory_order_release);
+}
+
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+ mutex_.AssertHeld();
+ if (!opened_successfully_) {
+ // Compaction may introduce a data race with DB open
+ return;
+ }
+ if (bg_work_paused_ > 0) {
+ // we paused the background work
+ return;
+ } else if (error_handler_.IsBGWorkStopped() &&
+ !error_handler_.IsRecoveryInProgress()) {
+ // There has been a hard error and this call is not part of the recovery
+ // sequence. Bail out here so we don't get into an endless loop of
+ // scheduling BG work which will again call this function
+ return;
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ // DB is being deleted; no more background compactions
+ return;
+ }
+ auto bg_job_limits = GetBGJobLimits();
+ bool is_flush_pool_empty =
+ env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
+ while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ < bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::HIGH;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0",
+ &unscheduled_flushes_);
+ }
+
+ // special case -- if high-pri (flush) thread pool is empty, then schedule
+ // flushes in low-pri (compaction) thread pool.
+ if (is_flush_pool_empty) {
+ while (unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ + bg_compaction_scheduled_ <
+ bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::LOW;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ }
+ }
+
+ if (bg_compaction_paused_ > 0) {
+ // we paused the background compaction
+ return;
+ } else if (error_handler_.IsBGWorkStopped()) {
+ // Compaction is not part of the recovery sequence from a hard error. We
+ // might get here because recovery might do a flush and install a new
+ // super version, which will try to schedule pending compactions. Bail
+ // out here and let the higher level recovery handle compactions
+ return;
+ }
+
+ if (HasExclusiveManualCompaction()) {
+ // only manual compactions are allowed to run. don't schedule automatic
+ // compactions
+ TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict");
+ return;
+ }
+
+ while (bg_compaction_scheduled_ < bg_job_limits.max_compactions &&
+ unscheduled_compactions_ > 0) {
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = nullptr;
+ bg_compaction_scheduled_++;
+ unscheduled_compactions_--;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleCompactionCallback);
+ }
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const {
+ mutex_.AssertHeld();
+ return GetBGJobLimits(immutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ write_controller_.NeedSpeedupCompaction());
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions) {
+ BGJobLimits res;
+ if (max_background_flushes == -1 && max_background_compactions == -1) {
+ // for our first stab implementing max_background_jobs, simply allocate a
+ // quarter of the threads to flushes.
+ res.max_flushes = std::max(1, max_background_jobs / 4);
+ res.max_compactions = std::max(1, max_background_jobs - res.max_flushes);
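+ // For example, max_background_jobs == 8 yields max_flushes == 2 and
+ // max_compactions == 6 (before the parallelize_compactions adjustment
+ // below).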
+ } else {
+ // compatibility code in case users haven't migrated to max_background_jobs,
+ // which automatically computes flush/compaction limits
+ res.max_flushes = std::max(1, max_background_flushes);
+ res.max_compactions = std::max(1, max_background_compactions);
+ }
+ if (!parallelize_compactions) {
+ // throttle background compactions until we deem it necessary
+ res.max_compactions = 1;
+ }
+ return res;
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+ assert(!cfd->queued_for_compaction());
+ cfd->Ref();
+ compaction_queue_.push_back(cfd);
+ cfd->set_queued_for_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+ assert(!compaction_queue_.empty());
+ auto cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(cfd->queued_for_compaction());
+ cfd->set_queued_for_compaction(false);
+ return cfd;
+}
+
+DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
+ assert(!flush_queue_.empty());
+ FlushRequest flush_req = flush_queue_.front();
+ flush_queue_.pop_front();
+ // TODO: need to unset flush reason?
+ return flush_req;
+}
+
+ColumnFamilyData* DBImpl::PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer) {
+ assert(!compaction_queue_.empty());
+ assert(*token == nullptr);
+ autovector<ColumnFamilyData*> throttled_candidates;
+ ColumnFamilyData* cfd = nullptr;
+ while (!compaction_queue_.empty()) {
+ auto first_cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(first_cfd->queued_for_compaction());
+ if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) {
+ throttled_candidates.push_back(first_cfd);
+ continue;
+ }
+ cfd = first_cfd;
+ cfd->set_queued_for_compaction(false);
+ break;
+ }
+ // Add throttled compaction candidates back to queue in the original order.
+ for (auto iter = throttled_candidates.rbegin();
+ iter != throttled_candidates.rend(); ++iter) {
+ compaction_queue_.push_front(*iter);
+ }
+ return cfd;
+}
+
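+// A FlushRequest may cover several column families (e.g. atomic flush). Each
+// CFD is referenced here and unreferenced again in BackgroundFlush() once the
+// request has been picked up, whether or not it actually gets flushed.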
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+ FlushReason flush_reason) {
+ if (flush_req.empty()) {
+ return;
+ }
+ for (auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ cfd->Ref();
+ cfd->SetFlushReason(flush_reason);
+ }
+ ++unscheduled_flushes_;
+ flush_queue_.push_back(flush_req);
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+ if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+}
+
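+// Queues a file for deferred deletion by the purge thread, keyed by file
+// number so that ShouldPurge() can skip it during a concurrent full scan in
+// FindObsoleteFiles().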
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id) {
+ mutex_.AssertHeld();
+ PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
+ purge_files_.insert({{number, std::move(file_info)}});
+}
+
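+// The BGWork*/Unschedule* pairs below share the same ownership contract: the
+// argument is heap-allocated at scheduling time, and exactly one of the work
+// function or the unschedule callback runs and deletes it.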
+void DBImpl::BGWorkFlush(void* arg) {
+ FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+
+ IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+ static_cast_with_check<DBImpl, DB>(fta.db_)->BackgroundCallFlush(
+ fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+ CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+ delete reinterpret_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+ TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+ auto prepicked_compaction =
+ static_cast<PrepickedCompaction*>(ca.prepicked_compaction);
+ static_cast_with_check<DBImpl, DB>(ca.db)->BackgroundCallCompaction(
+ prepicked_compaction, Env::Priority::LOW);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkBottomCompaction(void* arg) {
+ CompactionArg ca = *(static_cast<CompactionArg*>(arg));
+ delete static_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM);
+ TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction");
+ auto* prepicked_compaction = ca.prepicked_compaction;
+ assert(prepicked_compaction && prepicked_compaction->compaction &&
+ !prepicked_compaction->manual_compaction_state);
+ ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkPurge(void* db) {
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+ reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
+ CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+ delete reinterpret_cast<CompactionArg*>(arg);
+ if (ca.prepicked_compaction != nullptr) {
+ if (ca.prepicked_compaction->compaction != nullptr) {
+ delete ca.prepicked_compaction->compaction;
+ }
+ delete ca.prepicked_compaction;
+ }
+ TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback");
+}
+
+void DBImpl::UnscheduleFlushCallback(void* arg) {
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+ TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
+}
+
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ Status status;
+ *reason = FlushReason::kOthers;
+ // If BG work is stopped due to an error, but a recovery is in progress,
+ // that means this flush is part of the recovery. So allow it to go through
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ }
+ } else if (!error_handler_.IsRecoveryInProgress()) {
+ status = error_handler_.GetBGError();
+ }
+
+ if (!status.ok()) {
+ return status;
+ }
+
+ autovector<BGFlushArg> bg_flush_args;
+ std::vector<SuperVersionContext>& superversion_contexts =
+ job_context->superversion_contexts;
+ autovector<ColumnFamilyData*> column_families_not_to_flush;
+ while (!flush_queue_.empty()) {
+ // This cfd is already referenced
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ superversion_contexts.clear();
+ superversion_contexts.reserve(flush_req.size());
+
+ for (const auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
+ // can't flush this CF, try next one
+ column_families_not_to_flush.push_back(cfd);
+ continue;
+ }
+ superversion_contexts.emplace_back(SuperVersionContext(true));
+ bg_flush_args.emplace_back(cfd, iter.second,
+ &(superversion_contexts.back()));
+ }
+ if (!bg_flush_args.empty()) {
+ break;
+ }
+ }
+
+ if (!bg_flush_args.empty()) {
+ auto bg_job_limits = GetBGJobLimits();
+ for (const auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "Calling FlushMemTableToOutputFile with column "
+ "family [%s], flush slots available %d, compaction slots available "
+ "%d, "
+ "flush slots scheduled %d, compaction slots scheduled %d",
+ cfd->GetName().c_str(), bg_job_limits.max_flushes,
+ bg_job_limits.max_compactions, bg_flush_scheduled_,
+ bg_compaction_scheduled_);
+ }
+ status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress,
+ job_context, log_buffer, thread_pri);
+ TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush");
+ // All the CFDs in the FlushReq must have the same flush reason, so just
+ // grab the first one
+ *reason = bg_flush_args[0].cfd_->GetFlushReason();
+ for (auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ if (cfd->UnrefAndTryDelete()) {
+ arg.cfd_ = nullptr;
+ }
+ }
+ }
+ for (auto cfd : column_families_not_to_flush) {
+ cfd->UnrefAndTryDelete();
+ }
+ return status;
+}
+
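+// Thread-pool entry point for one flush attempt. On a failure that is not a
+// shutdown, a dropped column family, or error-recovery work, it signals
+// waiters, logs the error, and sleeps for one second so that a persistent
+// environmental problem does not turn into a hot retry loop.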
+void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start");
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ {
+ InstrumentedMutexLock l(&mutex_);
+ assert(bg_flush_scheduled_);
+ num_running_flushes_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ FlushReason reason;
+
+ Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
+ &reason, thread_pri);
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
+ reason != FlushReason::kErrorRecovery) {
+ // Wait a little bit before retrying background flush in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed flushes for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Waiting after background flush error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ log_buffer.FlushBufferToLog();
+ LogFlush(immutable_db_options_.info_log);
+ env_->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ }
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If flush failed, we want to delete all temporary files that we might have
+ // created. Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsColumnFamilyDropped());
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+ // Have to flush the info logs before bg_flush_scheduled_--
+ // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // state of DB, so info_log might not be available after that point.
+      // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+ assert(num_running_flushes_ > 0);
+ num_running_flushes_--;
+ bg_flush_scheduled_--;
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+ atomic_flush_install_cv_.SignalAll();
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority bg_thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ TEST_SYNC_POINT("BackgroundCallCompaction:0");
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ num_running_compactions_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ assert((bg_thread_pri == Env::Priority::BOTTOM &&
+ bg_bottom_compaction_scheduled_) ||
+ (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+ Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+ prepicked_compaction, bg_thread_pri);
+ TEST_SYNC_POINT("BackgroundCallCompaction:1");
+ if (s.IsBusy()) {
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ env_->SleepForMicroseconds(10000); // prevent hot loop
+ mutex_.Lock();
+ } else if (!s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+ // Wait a little bit before retrying background compaction in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed compactions for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Waiting after background compaction error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ LogFlush(immutable_db_options_.info_log);
+ env_->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ } else if (s.IsManualCompactionPaused()) {
+ ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+ assert(m);
+ ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
+ m->cfd->GetName().c_str(), job_context.job_id);
+ }
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If compaction failed, we want to delete all temporary files that we might
+ // have created (they might not be all recorded in job_context in case of a
+ // failure). Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() &&
+ !s.IsColumnFamilyDropped());
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ // Have to flush the info logs before bg_compaction_scheduled_--
+      // because if bg_compaction_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // state of DB, so info_log might not be available after that point.
+      // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+
+ assert(num_running_compactions_ > 0);
+ num_running_compactions_--;
+ if (bg_thread_pri == Env::Priority::LOW) {
+ bg_compaction_scheduled_--;
+ } else {
+ assert(bg_thread_pri == Env::Priority::BOTTOM);
+ bg_bottom_compaction_scheduled_--;
+ }
+
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+ if (made_progress ||
+ (bg_compaction_scheduled_ == 0 &&
+ bg_bottom_compaction_scheduled_ == 0) ||
+ HasPendingManualCompaction() || unscheduled_compactions_ == 0) {
+ // signal if
+ // * made_progress -- need to wakeup DelayWrite
+ // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+ // * HasPendingManualCompaction -- need to wakeup RunManualCompaction
+ // If none of this is true, there is no need to signal since nobody is
+ // waiting for it
+ bg_cv_.SignalAll();
+ }
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
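+// Runs a single compaction, either handed in via prepicked_compaction (manual
+// compactions and jobs forwarded to the bottom-priority pool) or picked here
+// from compaction_queue_. FIFO deletion compactions and trivial moves are
+// applied directly through a VersionEdit; everything else goes through a full
+// CompactionJob.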
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+ JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri) {
+ ManualCompactionState* manual_compaction =
+ prepicked_compaction == nullptr
+ ? nullptr
+ : prepicked_compaction->manual_compaction_state;
+ *made_progress = false;
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+ bool is_manual = (manual_compaction != nullptr);
+ std::unique_ptr<Compaction> c;
+ if (prepicked_compaction != nullptr &&
+ prepicked_compaction->compaction != nullptr) {
+ c.reset(prepicked_compaction->compaction);
+ }
+ bool is_prepicked = is_manual || c;
+
+ // (manual_compaction->in_progress == false);
+ bool trivial_move_disallowed =
+ is_manual && manual_compaction->disallow_trivial_move;
+
+ CompactionJobStats compaction_job_stats;
+ Status status;
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ } else if (is_manual &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ } else {
+ status = error_handler_.GetBGError();
+ // If we get here, it means a hard error happened after this compaction
+ // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got
+ // a chance to execute. Since we didn't pop a cfd from the compaction
+ // queue, increment unscheduled_compactions_
+ unscheduled_compactions_++;
+ }
+
+ if (!status.ok()) {
+ if (is_manual) {
+ manual_compaction->status = status;
+ manual_compaction->done = true;
+ manual_compaction->in_progress = false;
+ manual_compaction = nullptr;
+ }
+ if (c) {
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ }
+ return status;
+ }
+
+ if (is_manual) {
+ // another thread cannot pick up the same work
+ manual_compaction->in_progress = true;
+ }
+
+ std::unique_ptr<TaskLimiterToken> task_token;
+
+ // InternalKey manual_end_storage;
+ // InternalKey* manual_end = &manual_end_storage;
+ bool sfm_reserved_compact_space = false;
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ assert(m->in_progress);
+ if (!c) {
+ m->done = true;
+ m->manual_end = nullptr;
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Manual compaction from level-%d from %s .. "
+ "%s; nothing to do\n",
+ m->cfd->GetName().c_str(), m->input_level,
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"));
+ } else {
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ status = Status::CompactionTooLarge();
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Manual compaction from level-%d to level-%d from %s .. "
+ "%s; will stop at %s\n",
+ m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"),
+ ((m->done || m->manual_end == nullptr)
+ ? "(end)"
+ : m->manual_end->DebugString().c_str()));
+ }
+ }
+ } else if (!is_prepicked && !compaction_queue_.empty()) {
+ if (HasExclusiveManualCompaction()) {
+ // Can't compact right now, but try again later
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
+
+ // Stay in the compaction queue.
+ unscheduled_compactions_++;
+
+ return Status::OK();
+ }
+
+ auto cfd = PickCompactionFromQueue(&task_token, log_buffer);
+ if (cfd == nullptr) {
+ // Can't find any executable task from the compaction queue.
+ // All tasks have been throttled by compaction thread limiter.
+ ++unscheduled_compactions_;
+ return Status::Busy();
+ }
+
+ // We unreference here because the following code will take a Ref() on
+ // this cfd if it is going to use it (Compaction class holds a
+ // reference).
+ // This will all happen under a mutex so we don't have to be afraid of
+ // somebody else deleting it.
+ if (cfd->UnrefAndTryDelete()) {
+ // This was the last reference of the column family, so no need to
+ // compact.
+ return Status::OK();
+ }
+
+ // Pick up latest mutable CF Options and use it throughout the
+ // compaction job
+    // Compaction makes a copy of the latest MutableCFOptions. It should be
+    // used throughout the compaction procedure to ensure consistency. It will
+    // eventually be installed into the SuperVersion.
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
+ // NOTE: try to avoid unnecessary copy of MutableCFOptions if
+ // compaction is not necessary. Need to make sure mutex is held
+ // until we make a copy in the following code
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
+ c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer));
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+
+ if (c != nullptr) {
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_cf_options()),
+ *(c->mutable_cf_options()));
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+
+ c.reset();
+ // Don't need to sleep here, because BackgroundCallCompaction
+ // will sleep if !s.ok()
+ status = Status::CompactionTooLarge();
+ } else {
+ // update statistics
+ RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION,
+ c->inputs(0)->size());
+ // There are three things that can change compaction score:
+ // 1) When flush or compaction finish. This case is covered by
+ // InstallSuperVersionAndScheduleWork
+ // 2) When MutableCFOptions changes. This case is also covered by
+ // InstallSuperVersionAndScheduleWork, because this is when the new
+ // options take effect.
+ // 3) When we Pick a new compaction, we "remove" those files being
+ // compacted from the calculation, which then influences compaction
+ // score. Here we check if we need the new compaction even without the
+ // files that are currently being compacted. If we need another
+ // compaction, we might be able to execute it in parallel, so we add
+ // it to the queue and schedule a new thread.
+ if (cfd->NeedsCompaction()) {
+ // Yes, we need more compactions!
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ MaybeScheduleFlushOrCompaction();
+ }
+ }
+ }
+ }
+ }
+
+ if (!c) {
+ // Nothing to do
+ ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+ } else if (c->deletion_compaction()) {
+ // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+ // file if there is alive snapshot pointing to it
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ assert(c->num_input_files(1) == 0);
+ assert(c->level() == 0);
+ assert(c->column_family_data()->ioptions()->compaction_style ==
+ kCompactionStyleFIFO);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ for (const auto& f : *c->inputs(0)) {
+ c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+ }
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+ c->column_family_data()->GetName().c_str(),
+ c->num_input_files(0));
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ // Instrument for event update
+ // TODO(yhchiang): add op details for showing trivial-move.
+ ThreadStatusUtil::SetColumnFamily(
+ c->column_family_data(), c->column_family_data()->ioptions()->env,
+ immutable_db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ // Move files to next level
+ int32_t moved_files = 0;
+ int64_t moved_bytes = 0;
+ for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+ if (c->level(l) == c->output_level()) {
+ continue;
+ }
+ for (size_t i = 0; i < c->num_input_files(l); i++) {
+ FileMetaData* f = c->input(l, i);
+ c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+ c->edit()->AddFile(c->output_level(), f->fd.GetNumber(),
+ f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest,
+ f->largest, f->fd.smallest_seqno,
+ f->fd.largest_seqno, f->marked_for_compaction,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name);
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+ c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+ c->output_level(), f->fd.GetFileSize());
+ ++moved_files;
+ moved_bytes += f->fd.GetFileSize();
+ }
+ }
+
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ // Use latest MutableCFOptions
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+ moved_bytes);
+ {
+ event_logger_.LogToBuffer(log_buffer)
+ << "job" << job_context->job_id << "event"
+ << "trivial_move"
+ << "destination_level" << c->output_level() << "files" << moved_files
+ << "total_files_size" << moved_bytes;
+ }
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+ c->column_family_data()->GetName().c_str(), moved_files,
+ c->output_level(), moved_bytes, status.ToString().c_str(),
+ c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
+ *made_progress = true;
+
+ // Clear Instrument
+ ThreadStatusUtil::ResetThreadStatus();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!is_prepicked && c->output_level() > 0 &&
+ c->output_level() ==
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->MaxOutputLevel(
+ immutable_db_options_.allow_ingest_behind) &&
+ env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+    // Forward compactions involving the last level to the bottom pool if it
+    // exists, so that compactions unlikely to contribute to write stalls can
+    // be delayed or deprioritized.
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool");
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->compaction = c.release();
+ ca->prepicked_compaction->manual_compaction_state = nullptr;
+ // Transfer requested token, so it doesn't need to do it again.
+ ca->prepicked_compaction->task_token = std::move(task_token);
+ ++bg_bottom_compaction_scheduled_;
+ env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM,
+ this, &DBImpl::UnscheduleCompactionCallback);
+ } else {
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ int output_level __attribute__((__unused__));
+ output_level = c->output_level();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+ &output_level);
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ assert(is_snapshot_supported_ || snapshots_.empty());
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()), stats_,
+ &mutex_, &error_handler_, snapshot_seqs,
+ earliest_write_conflict_snapshot, snapshot_checker, table_cache_,
+ &event_logger_, c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, thread_pri,
+ is_manual ? &manual_compaction_paused_ : nullptr);
+ compaction_job.Prepare();
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ mutex_.Unlock();
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+ compaction_job.Run();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+ mutex_.Lock();
+
+ status = compaction_job.Install(*c->mutable_cf_options());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ }
+ if (c != nullptr) {
+ c->ReleaseCompactionFiles(status);
+ *made_progress = true;
+
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+ }
+
+ if (status.ok() || status.IsCompactionTooLarge() ||
+ status.IsManualCompactionPaused()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+ status.ToString().c_str());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
+ // Put this cfd back in the compaction queue so we can retry after some
+ // time
+ auto cfd = c->column_family_data();
+ assert(cfd != nullptr);
+ // Since this compaction failed, we need to recompute the score so it
+ // takes the original input files into account
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_cf_options()),
+ *(c->mutable_cf_options()));
+ if (!cfd->queued_for_compaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+ }
+ }
+ // this will unref its input_version and column_family_data
+ c.reset();
+
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ if (!status.ok()) {
+ m->status = status;
+ m->done = true;
+ }
+    // For universal compaction:
+    // Because universal compaction always happens at level 0, one
+    // compaction will pick up all overlapping files. No files will be
+    // filtered out due to the size limit and left for a successive compaction.
+    // So we can safely conclude the current compaction.
+ //
+ // Also note that, if we don't stop here, then the current compaction
+ // writes a new file back to level 0, which will be used in successive
+ // compaction. Hence the manual compaction will never finish.
+ //
+ // Stop the compaction if manual_end points to nullptr -- this means
+ // that we compacted the whole range. manual_end should always point
+ // to nullptr in case of universal compaction
+ if (m->manual_end == nullptr) {
+ m->done = true;
+ }
+ if (!m->done) {
+ // We only compacted part of the requested range. Update *m
+ // to the range that is left to be compacted.
+ // Universal and FIFO compactions should always compact the whole range
+ assert(m->cfd->ioptions()->compaction_style !=
+ kCompactionStyleUniversal ||
+ m->cfd->ioptions()->num_levels > 1);
+ assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+ m->tmp_storage = *m->manual_end;
+ m->begin = &m->tmp_storage;
+ m->incomplete = true;
+ }
+ m->in_progress = false; // not being processed anymore
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+ return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+ return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) {
+ manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) {
+ // Remove from queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ it = manual_compaction_dequeue_.erase(it);
+ return;
+ }
+ ++it;
+ }
+ assert(false);
+ return;
+}
+
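+// Returns true if the manual compaction m has to wait: either external file
+// ingestion is in flight, m is exclusive and background compactions are
+// already running, or an overlapping manual compaction queued ahead of m has
+// not started yet.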
+bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) {
+ if (num_running_ingest_file_ > 0) {
+ // We need to wait for other IngestExternalFile() calls to finish
+ // before running a manual compaction.
+ return true;
+ }
+ if (m->exclusive) {
+ return (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0);
+ }
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ bool seen = false;
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ ++it;
+ seen = true;
+ continue;
+ } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+      // The other manual compaction *it conflicts with m if it overlaps
+      // with m, is ahead of m in the queue, and is not yet in progress.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+  // Scan the queued manual compactions
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+      // A manual compaction for this column family is queued but has not
+      // started yet; once it is in progress or done it no longer counts here.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+  // Scan the queued manual compactions for an exclusive one
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
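+// Two manual compactions are treated as overlapping if either one is
+// exclusive or both target the same column family.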
+bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
+ if ((m->exclusive) || (m1->exclusive)) {
+ return true;
+ }
+ if (m->cfd != m1->cfd) {
+ return false;
+ }
+ return true;
+}
+
+#ifndef ROCKSDB_LITE
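+// Fills a CompactionJobInfo snapshot for listener callbacks: identifiers,
+// levels, stats, and the input/output file lists with their table properties
+// where available.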
+void DBImpl::BuildCompactionJobInfo(
+ const ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id,
+ const Version* current, CompactionJobInfo* compaction_job_info) const {
+ assert(compaction_job_info != nullptr);
+ compaction_job_info->cf_id = cfd->GetID();
+ compaction_job_info->cf_name = cfd->GetName();
+ compaction_job_info->status = st;
+ compaction_job_info->thread_id = env_->GetThreadID();
+ compaction_job_info->job_id = job_id;
+ compaction_job_info->base_input_level = c->start_level();
+ compaction_job_info->output_level = c->output_level();
+ compaction_job_info->stats = compaction_job_stats;
+ compaction_job_info->table_properties = c->GetOutputTableProperties();
+ compaction_job_info->compaction_reason = c->compaction_reason();
+ compaction_job_info->compression = c->output_compression();
+ for (size_t i = 0; i < c->num_input_levels(); ++i) {
+ for (const auto fmd : *c->inputs(i)) {
+ const FileDescriptor& desc = fmd->fd;
+ const uint64_t file_number = desc.GetNumber();
+ auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number,
+ desc.GetPathId());
+ compaction_job_info->input_files.push_back(fn);
+ compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
+ static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+ if (compaction_job_info->table_properties.count(fn) == 0) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = current->GetTableProperties(&tp, fmd, &fn);
+ if (s.ok()) {
+ compaction_job_info->table_properties[fn] = tp;
+ }
+ }
+ }
+ }
+ for (const auto& newf : c->edit()->GetNewFiles()) {
+ const FileMetaData& meta = newf.second;
+ const FileDescriptor& desc = meta.fd;
+ const uint64_t file_number = desc.GetNumber();
+ compaction_job_info->output_files.push_back(TableFileName(
+ c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId()));
+ compaction_job_info->output_file_infos.push_back(CompactionFileInfo{
+ newf.first, file_number, meta.oldest_blob_file_number});
+ }
+}
+#endif
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same sv_context, we can't reuse the SuperVersion() that got malloced,
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversion_to_free.
+
+void DBImpl::InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options) {
+ mutex_.AssertHeld();
+
+ // Update max_total_in_memory_state_
+ size_t old_memtable_size = 0;
+ auto* old_sv = cfd->GetSuperVersion();
+ if (old_sv) {
+ old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+ old_sv->mutable_cf_options.max_write_buffer_number;
+ }
+
+ // this branch is unlikely to step in
+ if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+ sv_context->NewSuperVersion();
+ }
+ cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options);
+
+  // There may be a small data race here. The snapshot triggering bottommost
+  // compaction may already be released here. But assuming newer snapshots
+  // will always be created and released frequently, the compaction will be
+  // triggered soon anyway.
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+ for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+ bottommost_files_mark_threshold_ = std::min(
+ bottommost_files_mark_threshold_,
+ my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+
+ // Whenever we install new SuperVersion, we might need to issue new flushes or
+ // compactions.
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+
+ // Update max_total_in_memory_state_
+ max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+ mutable_cf_options.write_buffer_size *
+ mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and db mutex (mutex_) should already be held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain the list of files in
+// directories, e.g. env_->GetChildren(), while holding the db mutex.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+ return files_grabbed_for_purge_.find(file_number) ==
+ files_grabbed_for_purge_.end() &&
+ purge_files_.find(file_number) == purge_files_.end();
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+ files_grabbed_for_purge_.insert(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+ InstrumentedMutexLock l(&mutex_);
+  // snapshot_checker_ should only be set once. If we need to set it multiple
+  // times, we need to make sure the old one is not deleted while it is still
+  // being used by a compaction job.
+ assert(!snapshot_checker_);
+ snapshot_checker_.reset(snapshot_checker);
+}
+
+void DBImpl::GetSnapshotContext(
+ JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker_ptr) {
+ mutex_.AssertHeld();
+ assert(job_context != nullptr);
+ assert(snapshot_seqs != nullptr);
+ assert(earliest_write_conflict_snapshot != nullptr);
+ assert(snapshot_checker_ptr != nullptr);
+
+ *snapshot_checker_ptr = snapshot_checker_.get();
+ if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+ *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+ }
+ if (*snapshot_checker_ptr != nullptr) {
+ // If snapshot_checker is used, that means the flush/compaction may
+ // contain values not visible to snapshot taken after
+ // flush/compaction job starts. Take a snapshot and it will appear
+ // in snapshot_seqs and force compaction iterator to consider such
+ // snapshots.
+ const Snapshot* job_snapshot =
+ GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
+ job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
+ }
+ *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc
new file mode 100644
index 000000000..610b57d39
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_debug.cc
@@ -0,0 +1,294 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+void DBImpl::TEST_SwitchWAL() {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ void* writer = TEST_BeginWrite();
+ SwitchWAL(&write_context);
+ TEST_EndWrite(writer);
+}
+
+bool DBImpl::TEST_WALBufferIsEmpty(bool lock) {
+ if (lock) {
+ log_write_mutex_.Lock();
+ }
+ log::Writer* cur_log_writer = logs_.back().writer;
+ auto res = cur_log_writer->TEST_BufferIsEmpty();
+ if (lock) {
+ log_write_mutex_.Unlock();
+ }
+ return res;
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+ ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ InstrumentedMutexLock l(&mutex_);
+ metadata->resize(NumberLevels());
+ for (int level = 0; level < NumberLevels(); level++) {
+ const std::vector<FileMetaData*>& files =
+ cfd->current()->storage_info()->LevelFiles(level);
+
+ (*metadata)[level].clear();
+ for (const auto& f : files) {
+ (*metadata)[level].push_back(*f);
+ }
+ }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+ return versions_->manifest_file_number();
+}
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+ return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+ const Slice* end,
+ ColumnFamilyHandle* column_family,
+ bool disallow_trivial_move) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ int output_level =
+ (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+ ? level
+ : level + 1;
+ return RunManualCompaction(cfd, level, output_level, CompactRangeOptions(),
+ begin, end, true, disallow_trivial_move,
+ port::kMaxUint64 /*max_file_num_to_ignore*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ }
+
+ Status s;
+ void* writer = TEST_BeginWrite();
+ if (two_write_queues_) {
+ WriteThread::Writer nonmem_w;
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ s = SwitchMemtable(cfd, &write_context);
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ } else {
+ s = SwitchMemtable(cfd, &write_context);
+ }
+ TEST_EndWrite(writer);
+ return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+ ColumnFamilyHandle* cfh) {
+ FlushOptions fo;
+ fo.wait = wait;
+ fo.allow_write_stall = allow_write_stall;
+ ColumnFamilyData* cfd;
+ if (cfh == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfhi = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh);
+ cfd = cfhi->cfd();
+ }
+ return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts) {
+ return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds, const FlushOptions& flush_opts) {
+ return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
+ // Wait until the compaction completes
+
+ // TODO: a bug here. This function actually does not necessarily
+ // wait for compact. It actually waits for scheduled compaction
+ // OR flush to finish.
+
+ InstrumentedMutexLock l(&mutex_);
+ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ ||
+ (wait_unscheduled && unscheduled_compactions_)) &&
+ (error_handler_.GetBGError() == Status::OK())) {
+ bg_cv_.Wait();
+ }
+ return error_handler_.GetBGError();
+}
+
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void* DBImpl::TEST_BeginWrite() {
+ auto w = new WriteThread::Writer();
+ write_thread_.EnterUnbatched(w, &mutex_);
+ return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+ auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+ write_thread_.ExitUnbatched(writer);
+ delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return logs_to_free_.size();
+}
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+ InstrumentedMutexLock l(&mutex_);
+ return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+ std::vector<std::string> cf_names;
+ std::vector<const ImmutableCFOptions*> iopts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_names.push_back(cfd->GetName());
+ iopts.push_back(cfd->ioptions());
+ }
+ }
+ iopts_map->clear();
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ iopts_map->insert({cf_names[i], iopts[i]});
+ }
+
+ return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+ return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+ return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+ return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+ autovector<MemTable*> empty_list;
+ return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr,
+ empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+ ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+ return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastAllocatedSequence();
+ }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+ uint64_t write_buffer_size) const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+void DBImpl::TEST_WaitForDumpStatsRun(std::function<void()> callback) const {
+ if (thread_dump_stats_ != nullptr) {
+ thread_dump_stats_->TEST_WaitForRun(callback);
+ }
+}
+
+void DBImpl::TEST_WaitForPersistStatsRun(std::function<void()> callback) const {
+ if (thread_persist_stats_ != nullptr) {
+ thread_persist_stats_->TEST_WaitForRun(callback);
+ }
+}
+
+bool DBImpl::TEST_IsPersistentStatsEnabled() const {
+ return thread_persist_stats_ && thread_persist_stats_->IsRunning();
+}
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+ return EstimateInMemoryStatsHistorySize();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // NDEBUG
diff --git a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 000000000..f0c17ce95
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ InternalKey start_key, end_key;
+ if (begin != nullptr) {
+ start_key.SetMinPossibleForUserKey(*begin);
+ }
+ if (end != nullptr) {
+ end_key.SetMaxPossibleForUserKey(*end);
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto vstorage = cfd->current()->storage_info();
+ for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+ std::vector<FileMetaData*> inputs;
+ vstorage->GetOverlappingInputs(
+ level, begin == nullptr ? nullptr : &start_key,
+ end == nullptr ? nullptr : &end_key, &inputs);
+ for (auto f : inputs) {
+ f->marked_for_compaction = true;
+ }
+ }
+ // Since we have some more files to compact, we should also recompute
+ // compaction score
+ vstorage->ComputeCompactionScore(*cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions());
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+ assert(column_family);
+
+ if (target_level < 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+ return Status::InvalidArgument("Invalid target level");
+ }
+
+ Status status;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ if (target_level >= vstorage->num_levels()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Target level %d does not exist\n",
+ target_level);
+ job_context.Clean();
+ return Status::InvalidArgument("Target level does not exist");
+ }
+
+ // Sort L0 files by range.
+ const InternalKeyComparator* icmp = &cfd->internal_comparator();
+ auto l0_files = vstorage->LevelFiles(0);
+ std::sort(l0_files.begin(), l0_files.end(),
+ [icmp](FileMetaData* f1, FileMetaData* f2) {
+ return icmp->Compare(f1->largest, f2->largest) < 0;
+ });
+
+ // Check that no L0 file is being compacted and that they have
+ // non-overlapping ranges.
+ for (size_t i = 0; i < l0_files.size(); ++i) {
+ auto f = l0_files[i];
+ if (f->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+ f->fd.GetNumber());
+ job_context.Clean();
+ return Status::InvalidArgument("PromoteL0 called during L0 compaction");
+ }
+
+ if (i == 0) continue;
+ auto prev_f = l0_files[i - 1];
+ if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
+ " have overlapping ranges\n",
+ prev_f->fd.GetNumber(), f->fd.GetNumber());
+ job_context.Clean();
+ return Status::InvalidArgument("L0 has overlapping files");
+ }
+ }
+
+ // Check that all levels up to target_level are empty.
+ for (int level = 1; level <= target_level; ++level) {
+ if (vstorage->NumLevelFiles(level) > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Level %d not empty\n", level);
+ job_context.Clean();
+ return Status::InvalidArgument(
+ "All levels up to target_level "
+ "must be empty");
+ }
+ }
+
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : l0_files) {
+ edit.DeleteFile(0, f->fd.GetNumber());
+ edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time,
+ f->file_checksum, f->file_checksum_func_name);
+ }
+
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ } // lock released here
+ LogFlush(immutable_db_options_.info_log);
+ job_context.Clean();
+
+ return status;
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc
new file mode 100644
index 000000000..c5d07dd01
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_files.cc
@@ -0,0 +1,667 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include <set>
+#include <unordered_set>
+#include "db/event_helpers.h"
+#include "db/memtable_list.h"
+#include "file/file_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t DBImpl::MinLogNumberToKeep() {
+ if (allow_2pc()) {
+ return versions_->min_log_number_to_keep_2pc();
+ } else {
+ return versions_->MinLogNumberWithUnflushedData();
+ }
+}
+
+uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
+ mutex_.AssertHeld();
+ if (!pending_outputs_.empty()) {
+ return *pending_outputs_.begin();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+// * Returns the list of live files in 'sst_live'
+// If it's doing full scan:
+// * Returns the list of all files in the filesystem in
+// 'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except every
+// mutable_db_options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan) {
+ mutex_.AssertHeld();
+
+ // if deletion is disabled, do nothing
+ if (disable_delete_obsolete_files_ > 0) {
+ return;
+ }
+
+ bool doing_the_full_scan = false;
+
+ // logic for figuring out if we're doing the full scan
+ if (no_full_scan) {
+ doing_the_full_scan = false;
+ } else if (force ||
+ mutable_db_options_.delete_obsolete_files_period_micros == 0) {
+ doing_the_full_scan = true;
+ } else {
+ const uint64_t now_micros = env_->NowMicros();
+ if ((delete_obsolete_files_last_run_ +
+ mutable_db_options_.delete_obsolete_files_period_micros) <
+ now_micros) {
+ doing_the_full_scan = true;
+ delete_obsolete_files_last_run_ = now_micros;
+ }
+ }
+
+  // Don't delete files that might currently be written to by compaction
+  // threads. Since job_context->min_pending_output is set here, mutex_
+  // cannot be released until the file scan finishes. Otherwise, we might see
+  // no min_pending_output here but later find newly generated, unfinalized
+  // files while scanning.
+ if (!pending_outputs_.empty()) {
+ job_context->min_pending_output = *pending_outputs_.begin();
+ } else {
+ // delete all of them
+ job_context->min_pending_output = std::numeric_limits<uint64_t>::max();
+ }
+
+ // Get obsolete files. This function will also update the list of
+ // pending files in VersionSet().
+ versions_->GetObsoleteFiles(&job_context->sst_delete_files,
+ &job_context->manifest_delete_files,
+ job_context->min_pending_output);
+
+  // Mark the elements in job_context->sst_delete_files as grabbedForPurge
+  // so that other threads calling FindObsoleteFiles with full_scan=true
+  // will not add these files to the candidate list for purge.
+ for (const auto& sst_to_del : job_context->sst_delete_files) {
+ MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber());
+ }
+
+ // store the current filenum, lognum, etc
+ job_context->manifest_file_number = versions_->manifest_file_number();
+ job_context->pending_manifest_file_number =
+ versions_->pending_manifest_file_number();
+ job_context->log_number = MinLogNumberToKeep();
+ job_context->prev_log_number = versions_->prev_log_number();
+
+ versions_->AddLiveFiles(&job_context->sst_live);
+ if (doing_the_full_scan) {
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+ std::set<std::string> paths;
+ for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+ path_id++) {
+ paths.insert(immutable_db_options_.db_paths[path_id].path);
+ }
+
+ // Note that if cf_paths is not specified in the ColumnFamilyOptions
+ // of a particular column family, we use db_paths as the cf_paths
+ // setting. Hence, there can be multiple duplicates of files from db_paths
+    // in the following code. The duplicates are removed while identifying
+ // unique files in PurgeObsoleteFiles.
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size();
+ path_id++) {
+ auto& path = cfd->ioptions()->cf_paths[path_id].path;
+
+ if (paths.find(path) == paths.end()) {
+ paths.insert(path);
+ }
+ }
+ }
+
+ for (auto& path : paths) {
+      // Set of all files in the directory. We'll exclude files that are still
+      // alive in the subsequent processing.
+ std::vector<std::string> files;
+ env_->GetChildren(path, &files); // Ignore errors
+ for (const std::string& file : files) {
+ uint64_t number;
+ FileType type;
+        // 1. If we cannot parse the file name, we skip it;
+        // 2. If the file with this file number has already been grabbed for
+        // purge by another compaction job, or has already been scheduled for
+        // purge, we also skip it while doing a full scan, in order to avoid
+        // double deletion of the same file under race conditions. See
+ // https://github.com/facebook/rocksdb/issues/3573
+ if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) ||
+ !ShouldPurge(number)) {
+ continue;
+ }
+
+ // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes
+ job_context->full_scan_candidate_files.emplace_back("/" + file, path);
+ }
+ }
+
+ // Add log files in wal_dir
+ if (immutable_db_options_.wal_dir != dbname_) {
+ std::vector<std::string> log_files;
+ env_->GetChildren(immutable_db_options_.wal_dir,
+ &log_files); // Ignore errors
+ for (const std::string& log_file : log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.wal_dir);
+ }
+ }
+ // Add info log files in db_log_dir
+ if (!immutable_db_options_.db_log_dir.empty() &&
+ immutable_db_options_.db_log_dir != dbname_) {
+ std::vector<std::string> info_log_files;
+ // Ignore errors
+ env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files);
+ for (std::string& log_file : info_log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.db_log_dir);
+ }
+ }
+ }
+
+ // logs_ is empty when called during recovery, in which case there can't yet
+ // be any tracked obsolete logs
+ if (!alive_log_files_.empty() && !logs_.empty()) {
+ uint64_t min_log_number = job_context->log_number;
+ size_t num_alive_log_files = alive_log_files_.size();
+ // find newly obsoleted log files
+ while (alive_log_files_.begin()->number < min_log_number) {
+ auto& earliest = *alive_log_files_.begin();
+ if (immutable_db_options_.recycle_log_file_num >
+ log_recycle_files_.size()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "adding log %" PRIu64 " to recycle list\n",
+ earliest.number);
+ log_recycle_files_.push_back(earliest.number);
+ } else {
+ job_context->log_delete_files.push_back(earliest.number);
+ }
+ if (job_context->size_log_to_delete == 0) {
+ job_context->prev_total_log_size = total_log_size_;
+ job_context->num_alive_log_files = num_alive_log_files;
+ }
+ job_context->size_log_to_delete += earliest.size;
+ total_log_size_ -= earliest.size;
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ alive_log_files_.pop_front();
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ // Current log should always stay alive since it can't have
+ // number < MinLogNumber().
+ assert(alive_log_files_.size());
+ }
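+    // Detach and free writers of logs that are now obsolete, waiting out any
+    // in-flight sync on a log before releasing its writer.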
+ while (!logs_.empty() && logs_.front().number < min_log_number) {
+ auto& log = logs_.front();
+ if (log.getting_synced) {
+ log_sync_cv_.Wait();
+ // logs_ could have changed while we were waiting.
+ continue;
+ }
+ logs_to_free_.push_back(log.ReleaseWriter());
+ {
+ InstrumentedMutexLock wl(&log_write_mutex_);
+ logs_.pop_front();
+ }
+ }
+ // Current log cannot be obsolete.
+ assert(!logs_.empty());
+ }
+
+ // We're just cleaning up for DB::Write().
+ assert(job_context->logs_to_free.empty());
+ job_context->logs_to_free = logs_to_free_;
+ job_context->log_recycle_files.assign(log_recycle_files_.begin(),
+ log_recycle_files_.end());
+ if (job_context->HaveSomethingToDelete()) {
+ ++pending_purge_obsolete_files_;
+ }
+ logs_to_free_.clear();
+}
+
+namespace {
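+// Orders candidate files by descending file name, breaking ties by
+// descending file path, so that std::sort followed by std::unique in
+// PurgeObsoleteFiles can collapse duplicate entries.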
+bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
+ const JobContext::CandidateFileInfo& second) {
+ if (first.file_name > second.file_name) {
+ return true;
+ } else if (first.file_name < second.file_name) {
+ return false;
+ } else {
+ return (first.file_path > second.file_path);
+ }
+}
+}  // namespace
+
+// Delete an obsolete file and log the status and information of the deletion
+void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync,
+ FileType type, uint64_t number) {
+ Status file_deletion_status;
+ if (type == kTableFile || type == kLogFile) {
+ file_deletion_status =
+ DeleteDBFile(&immutable_db_options_, fname, path_to_sync,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_);
+ } else {
+ file_deletion_status = env_->DeleteFile(fname);
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion",
+ &file_deletion_status);
+ if (file_deletion_status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
+ fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else if (env_->FileExists(fname).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+ " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ }
+ if (type == kTableFile) {
+ EventHelpers::LogAndNotifyTableFileDeletion(
+ &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
+ immutable_db_options_.listeners);
+ }
+}
+
+// Diffs the list of candidate files against the set of live files; files
+// that do not belong to the live set may be removed. Also removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin");
+  // we'd better have something to delete
+ assert(state.HaveSomethingToDelete());
+
+  // FindObsoleteFiles() should have populated this, so it must be nonzero
+ assert(state.manifest_file_number != 0);
+
+ // Now, convert live list to an unordered map, WITHOUT mutex held;
+ // set is slow.
+ std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map;
+ for (const FileDescriptor& fd : state.sst_live) {
+ sst_live_map[fd.GetNumber()] = &fd;
+ }
+ std::unordered_set<uint64_t> log_recycle_files_set(
+ state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+ auto candidate_files = state.full_scan_candidate_files;
+ candidate_files.reserve(
+ candidate_files.size() + state.sst_delete_files.size() +
+ state.log_delete_files.size() + state.manifest_delete_files.size());
+ // We may ignore the dbname when generating the file names.
+ for (auto& file : state.sst_delete_files) {
+ candidate_files.emplace_back(
+ MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
+ if (file.metadata->table_reader_handle) {
+ table_cache_->Release(file.metadata->table_reader_handle);
+ }
+ file.DeleteMetadata();
+ }
+
+ for (auto file_num : state.log_delete_files) {
+ if (file_num > 0) {
+ candidate_files.emplace_back(LogFileName(file_num),
+ immutable_db_options_.wal_dir);
+ }
+ }
+ for (const auto& filename : state.manifest_delete_files) {
+ candidate_files.emplace_back(filename, dbname_);
+ }
+
+ // dedup state.candidate_files so we don't try to delete the same
+ // file twice
+ std::sort(candidate_files.begin(), candidate_files.end(),
+ CompareCandidateFile);
+ candidate_files.erase(
+ std::unique(candidate_files.begin(), candidate_files.end()),
+ candidate_files.end());
+
+ if (state.prev_total_log_size > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Try to delete WAL files size %" PRIu64
+ ", prev total WAL file size %" PRIu64
+ ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+ state.job_id, state.size_log_to_delete,
+ state.prev_total_log_size, state.num_alive_log_files);
+ }
+
+ std::vector<std::string> old_info_log_files;
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+
+  // File numbers of the most recent two OPTIONS files in candidate_files
+  // (found in a previous FindObsoleteFiles(full_scan=true)).
+ // At this point, there must not be any duplicate file numbers in
+ // candidate_files.
+ uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+ uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& fname = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+ type != kOptionsFile) {
+ continue;
+ }
+ if (number > optsfile_num1) {
+ optsfile_num2 = optsfile_num1;
+ optsfile_num1 = number;
+ } else if (number > optsfile_num2) {
+ optsfile_num2 = number;
+ }
+ }
+
+ // Close WALs before trying to delete them.
+ for (const auto w : state.logs_to_free) {
+ // TODO: maybe check the return value of Close.
+ w->Close();
+ }
+
+ bool own_files = OwnTablesAndLogs();
+ std::unordered_set<uint64_t> files_to_del;
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& to_delete = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ // Ignore file if we cannot recognize it.
+ if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+ continue;
+ }
+
+ bool keep = true;
+ switch (type) {
+ case kLogFile:
+ keep = ((number >= state.log_number) ||
+ (number == state.prev_log_number) ||
+ (log_recycle_files_set.find(number) !=
+ log_recycle_files_set.end()));
+ break;
+ case kDescriptorFile:
+ // Keep my manifest file, and any newer incarnations'
+ // (can happen during manifest roll)
+ keep = (number >= state.manifest_file_number);
+ break;
+ case kTableFile:
+        // If the second condition were not there, this would make
+        // DontDeletePendingOutputs fail
+ keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+ number >= state.min_pending_output;
+ if (!keep) {
+ files_to_del.insert(number);
+ }
+ break;
+ case kTempFile:
+ // Any temp files that are currently being written to must
+ // be recorded in pending_outputs_, which is inserted into "live".
+ // Also, SetCurrentFile creates a temp file when writing out new
+ // manifest, which is equal to state.pending_manifest_file_number. We
+ // should not delete that file
+ //
+ // TODO(yhchiang): carefully modify the third condition to safely
+ // remove the temp options files.
+ keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+ (number == state.pending_manifest_file_number) ||
+ (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+ break;
+ case kInfoLogFile:
+ keep = true;
+ if (number != 0) {
+ old_info_log_files.push_back(to_delete);
+ }
+ break;
+ case kOptionsFile:
+ keep = (number >= optsfile_num2);
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1",
+ reinterpret_cast<void*>(&number));
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2",
+ reinterpret_cast<void*>(&keep));
+ break;
+ case kCurrentFile:
+ case kDBLockFile:
+ case kIdentityFile:
+ case kMetaDatabase:
+ case kBlobFile:
+ keep = true;
+ break;
+ }
+
+ if (keep) {
+ continue;
+ }
+
+ std::string fname;
+ std::string dir_to_sync;
+ if (type == kTableFile) {
+ // evict from cache
+ TableCache::Evict(table_cache_.get(), number);
+ fname = MakeTableFileName(candidate_file.file_path, number);
+ dir_to_sync = candidate_file.file_path;
+ } else {
+ dir_to_sync =
+ (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_;
+ fname = dir_to_sync +
+ ((!dir_to_sync.empty() && dir_to_sync.back() == '/') ||
+ (!to_delete.empty() && to_delete.front() == '/')
+ ? ""
+ : "/") +
+ to_delete;
+ }
+
+#ifndef ROCKSDB_LITE
+ if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 ||
+ immutable_db_options_.wal_size_limit_mb > 0)) {
+ wal_manager_.ArchiveWALFile(fname, number);
+ continue;
+ }
+#endif // !ROCKSDB_LITE
+
+    // If I do not own these files, e.g. secondary instance with max_open_files
+    // = -1, then there is no need to delete or schedule deletion of these
+    // files since they will be removed by their owner, e.g. the primary
+    // instance.
+ if (!own_files) {
+ continue;
+ }
+ Status file_deletion_status;
+ if (schedule_only) {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id);
+ } else {
+ DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number);
+ }
+ }
+
+ {
+ // After purging obsolete files, remove them from files_grabbed_for_purge_.
+ InstrumentedMutexLock guard_lock(&mutex_);
+ autovector<uint64_t> to_be_removed;
+ for (auto fn : files_grabbed_for_purge_) {
+ if (files_to_del.count(fn) != 0) {
+ to_be_removed.emplace_back(fn);
+ }
+ }
+ for (auto fn : to_be_removed) {
+ files_grabbed_for_purge_.erase(fn);
+ }
+ }
+
+ // Delete old info log files.
+ size_t old_info_log_file_count = old_info_log_files.size();
+ if (old_info_log_file_count != 0 &&
+ old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+ std::sort(old_info_log_files.begin(), old_info_log_files.end());
+ size_t end =
+ old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+ for (unsigned int i = 0; i <= end; i++) {
+ std::string& to_delete = old_info_log_files.at(i);
+ std::string full_path_to_delete =
+ (immutable_db_options_.db_log_dir.empty()
+ ? dbname_
+ : immutable_db_options_.db_log_dir) +
+ "/" + to_delete;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s\n", state.job_id,
+ full_path_to_delete.c_str());
+ Status s = env_->DeleteFile(full_path_to_delete);
+ if (!s.ok()) {
+ if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+ "-- %s\n",
+ state.job_id, to_delete.c_str(), s.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s FAILED -- %s\n",
+ state.job_id, to_delete.c_str(),
+ s.ToString().c_str());
+ }
+ }
+ }
+ }
+#ifndef ROCKSDB_LITE
+ wal_manager_.PurgeObsoleteWALFiles();
+#endif // ROCKSDB_LITE
+ LogFlush(immutable_db_options_.info_log);
+ InstrumentedMutexLock l(&mutex_);
+ --pending_purge_obsolete_files_;
+ assert(pending_purge_obsolete_files_ >= 0);
+ if (pending_purge_obsolete_files_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
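+// Convenience wrapper: finds obsolete files with force=true and, with the DB
+// mutex temporarily released, purges whatever was found.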
+void DBImpl::DeleteObsoleteFiles() {
+ mutex_.AssertHeld();
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+}
+
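+// Scans the mutable and immutable memtables of every column family other
+// than the one being flushed and returns the smallest WAL number that still
+// contains a prepared (2PC) section referenced by them, or 0 if none.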
+uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+ const autovector<MemTable*>& memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ // we must look through the memtables for two phase transactions
+ // that have been committed but not yet flushed
+ for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+ if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) {
+ continue;
+ }
+
+ auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+ memtables_to_flush);
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+
+ log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
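+// Computes the minimum log number that must be kept after cfd_to_flush is
+// flushed: the largest log number recorded in edit_list (falling back to the
+// CF's current log number), combined with the minimum log that still has
+// unflushed data in other CFs and with logs still referenced by outstanding
+// 2PC prepared sections or memtables.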
+uint64_t PrecomputeMinLogNumberToKeep(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ autovector<VersionEdit*> edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker) {
+ assert(vset != nullptr);
+ assert(prep_tracker != nullptr);
+ // Calculate updated min_log_number_to_keep
+ // Since the function should only be called in 2pc mode, log number in
+ // the version edit should be sufficient.
+
+ // Precompute the min log number containing unflushed data for the column
+ // family being flushed (`cfd_to_flush`).
+ uint64_t cf_min_log_number_to_keep = 0;
+ for (auto& e : edit_list) {
+ if (e->HasLogNumber()) {
+ cf_min_log_number_to_keep =
+ std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+ }
+ }
+ if (cf_min_log_number_to_keep == 0) {
+ // No version edit contains information on log number. The log number
+ // for this column family should stay the same as it is.
+ cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+ }
+
+ // Get min log number containing unflushed data for other column families.
+ uint64_t min_log_number_to_keep =
+ vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+ if (cf_min_log_number_to_keep != 0) {
+ min_log_number_to_keep =
+ std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+ }
+
+  // If we are in 2PC mode, we must consider logs containing prepared
+  // sections of outstanding transactions.
+  //
+  // We must check min logs with outstanding prep before we check
+  // logs referenced by memtables because a log referenced by the
+  // first data structure could transition to the second under us.
+ //
+  // TODO: we are iterating over all column families under the db mutex;
+  // we should find a more optimal solution
+ auto min_log_in_prep_heap =
+ prep_tracker->FindMinLogContainingOutstandingPrep();
+
+ if (min_log_in_prep_heap != 0 &&
+ min_log_in_prep_heap < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_in_prep_heap;
+ }
+
+ uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(
+ vset, &cfd_to_flush, memtables_to_flush);
+
+ if (min_log_refed_by_mem != 0 &&
+ min_log_refed_by_mem < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_refed_by_mem;
+ }
+ return min_log_number_to_keep;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc
new file mode 100644
index 000000000..6ae4ead54
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_open.cc
@@ -0,0 +1,1651 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/error_handler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/wal_filter.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src) {
+ auto db_options = SanitizeOptions(dbname, DBOptions(src));
+ ImmutableDBOptions immutable_db_options(db_options);
+ auto cf_options =
+ SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+ return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
+ DBOptions result(src);
+
+ if (result.file_system == nullptr) {
+ if (result.env == Env::Default()) {
+ result.file_system = FileSystem::Default();
+ } else {
+ result.file_system.reset(new LegacyFileSystemWrapper(result.env));
+ }
+ } else {
+ if (result.env == nullptr) {
+ result.env = Env::Default();
+ }
+ }
+
+  // A result.max_open_files value of -1 means "infinite" open files.
+ if (result.max_open_files != -1) {
+ int max_max_open_files = port::GetMaxOpenFiles();
+ if (max_max_open_files == -1) {
+ max_max_open_files = 0x400000;
+ }
+ ClipToRange(&result.max_open_files, 20, max_max_open_files);
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
+ &result.max_open_files);
+ }
+
+ if (result.info_log == nullptr) {
+ Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
+ if (!s.ok()) {
+ // No place suitable for logging
+ result.info_log = nullptr;
+ }
+ }
+
+ if (!result.write_buffer_manager) {
+ result.write_buffer_manager.reset(
+ new WriteBufferManager(result.db_write_buffer_size));
+ }
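+  // Derive the flush/compaction limits from the max_background_* settings and
+  // grow the Env's LOW and HIGH priority thread pools if they are smaller
+  // than those limits.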
+ auto bg_job_limits = DBImpl::GetBGJobLimits(
+ result.max_background_flushes, result.max_background_compactions,
+ result.max_background_jobs, true /* parallelize_compactions */);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+
+ if (result.rate_limiter.get() != nullptr) {
+ if (result.bytes_per_sync == 0) {
+ result.bytes_per_sync = 1024 * 1024;
+ }
+ }
+
+ if (result.delayed_write_rate == 0) {
+ if (result.rate_limiter.get() != nullptr) {
+ result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
+ }
+ if (result.delayed_write_rate == 0) {
+ result.delayed_write_rate = 16 * 1024 * 1024;
+ }
+ }
+
+ if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
+ result.recycle_log_file_num = false;
+ }
+
+ if (result.recycle_log_file_num &&
+ (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
+ result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
+ // kPointInTimeRecovery is inconsistent with recycle log file feature since
+ // we define the "end" of the log as the first corrupt record we encounter.
+ // kAbsoluteConsistency doesn't make sense because even a clean
+ // shutdown leaves old junk at the end of the log file.
+ result.recycle_log_file_num = 0;
+ }
+
+ if (result.wal_dir.empty()) {
+ // Use dbname as default
+ result.wal_dir = dbname;
+ }
+ if (result.wal_dir.back() == '/') {
+ result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+ }
+
+ if (result.db_paths.size() == 0) {
+ result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+ }
+
+ if (result.use_direct_reads && result.compaction_readahead_size == 0) {
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
+ result.compaction_readahead_size = 1024 * 1024 * 2;
+ }
+
+ if (result.compaction_readahead_size > 0 || result.use_direct_reads) {
+ result.new_table_reader_for_compaction_inputs = true;
+ }
+
+ // Force flush on DB open if 2PC is enabled, since with 2PC we have no
+ // guarantee that consecutive log files have consecutive sequence id, which
+  // makes recovery complicated.
+ if (result.allow_2pc) {
+ result.avoid_flush_during_recovery = false;
+ }
+
+#ifndef ROCKSDB_LITE
+ ImmutableDBOptions immutable_db_options(result);
+ if (!IsWalDirSameAsDBPath(&immutable_db_options)) {
+ // Either the WAL dir and db_paths[0]/db_name are not the same, or we
+ // cannot tell for sure. In either case, assume they're different and
+    // explicitly clean up the trash log files (bypassing DeleteScheduler).
+    // Do this first so that even if we end up calling
+    // DeleteScheduler::CleanupDirectory on the same dir later, it will be
+    // safe.
+ std::vector<std::string> filenames;
+ result.env->GetChildren(result.wal_dir, &filenames);
+ for (std::string& filename : filenames) {
+ if (filename.find(".log.trash", filename.length() -
+ std::string(".log.trash").length()) !=
+ std::string::npos) {
+ std::string trash_file = result.wal_dir + "/" + filename;
+ result.env->DeleteFile(trash_file);
+ }
+ }
+ }
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet. When we open the DB, we will find these .trash
+  // files and schedule them to be deleted (or delete them immediately if an
+  // SstFileManager was not used).
+ auto sfm = static_cast<SstFileManagerImpl*>(result.sst_file_manager.get());
+ for (size_t i = 0; i < result.db_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path);
+ }
+
+ // Create a default SstFileManager for purposes of tracking compaction size
+ // and facilitating recovery from out of space errors.
+ if (result.sst_file_manager.get() == nullptr) {
+ std::shared_ptr<SstFileManager> sst_file_manager(
+ NewSstFileManager(result.env, result.info_log));
+ result.sst_file_manager = sst_file_manager;
+ }
+#endif
+
+ if (!result.paranoid_checks) {
+ result.skip_checking_sst_file_sizes_on_db_open = true;
+ ROCKS_LOG_INFO(result.info_log,
+ "file size check will be skipped during open.");
+ }
+
+ return result;
+}
+
+namespace {
+Status SanitizeOptionsByTable(
+ const DBOptions& db_opts,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto cf : column_families) {
+ s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+}
+} // namespace
+
+Status DBImpl::ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto& cfd : column_families) {
+ s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ s = ValidateOptions(db_options);
+ return s;
+}
+
+Status DBImpl::ValidateOptions(const DBOptions& db_options) {
+ if (db_options.db_paths.size() > 4) {
+ return Status::NotSupported(
+ "More than four DB paths are not supported yet. ");
+ }
+
+ if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+ // Protect against assert in PosixMMapReadableFile constructor
+ return Status::NotSupported(
+ "If memory mapped reads (allow_mmap_reads) are enabled "
+ "then direct I/O reads (use_direct_reads) must be disabled. ");
+ }
+
+ if (db_options.allow_mmap_writes &&
+ db_options.use_direct_io_for_flush_and_compaction) {
+ return Status::NotSupported(
+ "If memory mapped writes (allow_mmap_writes) are enabled "
+ "then direct I/O writes (use_direct_io_for_flush_and_compaction) must "
+ "be disabled. ");
+ }
+
+ if (db_options.keep_log_file_num == 0) {
+ return Status::InvalidArgument("keep_log_file_num must be greater than 0");
+ }
+
+ if (db_options.unordered_write &&
+ !db_options.allow_concurrent_memtable_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with !allow_concurrent_memtable_write");
+ }
+
+ if (db_options.unordered_write && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with enable_pipelined_write");
+ }
+
+ if (db_options.atomic_flush && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "atomic_flush is incompatible with enable_pipelined_write");
+ }
+
+ return Status::OK();
+}
+
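+// Creates a brand-new database: writes the IDENTITY file, builds the first
+// descriptor file (manifest number 1) describing an empty DB, and points
+// CURRENT at it. The partially written manifest is deleted if any step fails.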
+Status DBImpl::NewDB() {
+ VersionEdit new_db;
+ Status s = SetIdentityFile(env_, dbname_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (immutable_db_options_.write_dbid_to_manifest) {
+ std::string temp_db_id;
+ GetDbIdentityFromIdentityFile(&temp_db_id);
+ new_db.SetDBId(temp_db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ {
+ std::unique_ptr<FSWritableFile> file;
+ FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
+ s = NewWritableFile(fs_.get(), manifest, &file, file_options);
+ if (!s.ok()) {
+ return s;
+ }
+ file->SetPreallocationBlockSize(
+ immutable_db_options_.manifest_preallocation_size);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), manifest, file_options, env_, nullptr /* stats */,
+ immutable_db_options_.listeners));
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ if (s.ok()) {
+ s = SyncManifest(env_, &immutable_db_options_, log.file());
+ }
+ }
+ if (s.ok()) {
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir());
+ } else {
+ fs_->DeleteFile(manifest, IOOptions(), nullptr);
+ }
+ return s;
+}
+
+Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname,
+ std::unique_ptr<Directory>* directory) {
+ // We call CreateDirIfMissing() as the directory may already exist (if we
+ // are reopening a DB), when this happens we don't want creating the
+ // directory to cause an error. However, we need to check if creating the
+ // directory fails or else we may get an obscure message about the lock
+ // file not existing. One real-world example of this occurring is if
+ // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+  // when dbname_ is "dir/db" but "dir" doesn't exist.
+ Status s = env->CreateDirIfMissing(dirname);
+ if (!s.ok()) {
+ return s;
+ }
+ return env->NewDirectory(dirname, directory);
+}
+
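+// Opens (creating them if missing) Directory handles for the DB directory, a
+// separate WAL directory when it differs from the DB directory, and each
+// extra data path; data paths equal to the DB directory get a nullptr
+// placeholder.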
+Status Directories::SetDirectories(Env* env, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths) {
+ Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!wal_dir.empty() && dbname != wal_dir) {
+ s = DBImpl::CreateAndNewDirectory(env, wal_dir, &wal_dir_);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ data_dirs_.clear();
+ for (auto& p : data_paths) {
+ const std::string db_path = p.path;
+ if (db_path == dbname) {
+ data_dirs_.emplace_back(nullptr);
+ } else {
+ std::unique_ptr<Directory> path_directory;
+ s = DBImpl::CreateAndNewDirectory(env, db_path, &path_directory);
+ if (!s.ok()) {
+ return s;
+ }
+ data_dirs_.emplace_back(path_directory.release());
+ }
+ }
+ assert(data_dirs_.size() == data_paths.size());
+ return Status::OK();
+}
+
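+// Recovers the DB state on open: locks and sets up the DB directories (unless
+// read-only), creates a new DB if allowed and none exists, recovers the
+// MANIFEST and DB ID, and replays any WAL files that are newer than the ones
+// recorded in the descriptor.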
+Status DBImpl::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ bool error_if_log_file_exist, bool error_if_data_exists_in_logs,
+ uint64_t* recovered_seq) {
+ mutex_.AssertHeld();
+
+ bool is_new_db = false;
+ assert(db_lock_ == nullptr);
+ if (!read_only) {
+ Status s = directories_.SetDirectories(env_, dbname_,
+ immutable_db_options_.wal_dir,
+ immutable_db_options_.db_paths);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::string current_fname = CurrentFileName(dbname_);
+ s = env_->FileExists(current_fname);
+ if (s.IsNotFound()) {
+ if (immutable_db_options_.create_if_missing) {
+ s = NewDB();
+ is_new_db = true;
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ return Status::InvalidArgument(
+ current_fname, "does not exist (create_if_missing is false)");
+ }
+ } else if (s.ok()) {
+ if (immutable_db_options_.error_if_exists) {
+ return Status::InvalidArgument(dbname_,
+ "exists (error_if_exists is true)");
+ }
+ } else {
+ // Unexpected error reading file
+ assert(s.IsIOError());
+ return s;
+ }
+ // Verify compatibility of file_options_ and filesystem
+ {
+ std::unique_ptr<FSRandomAccessFile> idfile;
+ FileOptions customized_fs(file_options_);
+ customized_fs.use_direct_reads |=
+ immutable_db_options_.use_direct_io_for_flush_and_compaction;
+ s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile,
+ nullptr);
+ if (!s.ok()) {
+ std::string error_str = s.ToString();
+ // Check if unsupported Direct I/O is the root cause
+ customized_fs.use_direct_reads = false;
+ s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile,
+ nullptr);
+ if (s.ok()) {
+ return Status::InvalidArgument(
+ "Direct I/O is not supported by the specified DB.");
+ } else {
+ return Status::InvalidArgument(
+ "Found options incompatible with filesystem", error_str.c_str());
+ }
+ }
+ }
+ }
+ assert(db_id_.empty());
+ Status s = versions_->Recover(column_families, read_only, &db_id_);
+ if (!s.ok()) {
+ return s;
+ }
+  // This happens when immutable_db_options_.write_dbid_to_manifest is set to
+  // true for the very first time.
+ if (db_id_.empty()) {
+ // Check for the IDENTITY file and create it if not there.
+ s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
+    // Typically the IDENTITY file is created in NewDB(); if for some reason
+    // it is no longer available, then at this point the DB ID is in neither
+    // the IDENTITY file nor the MANIFEST.
+ if (s.IsNotFound()) {
+ s = SetIdentityFile(env_, dbname_);
+ if (!s.ok()) {
+ return s;
+ }
+ } else if (!s.ok()) {
+ assert(s.IsIOError());
+ return s;
+ }
+ s = GetDbIdentityFromIdentityFile(&db_id_);
+ if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
+ VersionEdit edit;
+ edit.SetDBId(db_id_);
+ Options options;
+ MutableCFOptions mutable_cf_options(options);
+ versions_->db_id_ = db_id_;
+ s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options, &edit, &mutex_, nullptr,
+ false);
+ }
+ } else {
+ s = SetIdentityFile(env_, dbname_, db_id_);
+ }
+
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+ if (s.ok() && !read_only) {
+ std::map<std::string, std::shared_ptr<Directory>> created_dirs;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ s = cfd->AddDirectories(&created_dirs);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ // DB mutex is already held
+ if (s.ok() && immutable_db_options_.persist_stats_to_disk) {
+ s = InitPersistStatsColumnFamily();
+ }
+
+ if (s.ok()) {
+    // Initialize max_total_in_memory_state_ before recovering the logs. Log
+    // recovery may check this value to decide whether to flush.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+ // TODO(Zhongyi): handle single_column_family_mode_ when
+ // persistent_stats is enabled
+ single_column_family_mode_ =
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+ // Recover from all newer log files than the ones named in the
+ // descriptor (new log files may have been added by the previous
+ // incarnation without registering them in the descriptor).
+ //
+ // Note that prev_log_number() is no longer used, but we pay
+ // attention to it in case we are recovering a database
+ // produced by an older version of rocksdb.
+ std::vector<std::string> filenames;
+ s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("wal_dir not found",
+ immutable_db_options_.wal_dir);
+ } else if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<uint64_t> logs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
+ if (is_new_db) {
+ return Status::Corruption(
+ "While creating a new Db, wal_dir contains "
+ "existing log file: ",
+ filenames[i]);
+ } else {
+ logs.push_back(number);
+ }
+ }
+ }
+
+ if (logs.size() > 0) {
+ if (error_if_log_file_exist) {
+ return Status::Corruption(
+            "The db was opened in readonly mode with error_if_log_file_exist "
+            "flag but a log file already exists");
+ } else if (error_if_data_exists_in_logs) {
+ for (auto& log : logs) {
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log);
+ uint64_t bytes;
+ s = env_->GetFileSize(fname, &bytes);
+ if (s.ok()) {
+ if (bytes > 0) {
+ return Status::Corruption(
+                  "error_if_data_exists_in_logs is set but there is data "
+                  "in log files.");
+ }
+ }
+ }
+ }
+ }
+
+ if (!logs.empty()) {
+ // Recover in the order in which the logs were generated
+ std::sort(logs.begin(), logs.end());
+ bool corrupted_log_found = false;
+ s = RecoverLogFiles(logs, &next_sequence, read_only,
+ &corrupted_log_found);
+ if (corrupted_log_found && recovered_seq != nullptr) {
+ *recovered_seq = next_sequence;
+ }
+ if (!s.ok()) {
+ // Clear memtables if recovery failed
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
+ }
+ }
+ }
+
+ if (read_only) {
+ // If we are opening as read-only, we need to update options_file_number_
+ // to reflect the most recent OPTIONS file. It does not matter for regular
+ // read-write db instance because options_file_number_ will later be
+ // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
+ std::vector<std::string> file_names;
+ if (s.ok()) {
+ s = env_->GetChildren(GetName(), &file_names);
+ }
+ if (s.ok()) {
+ uint64_t number = 0;
+ uint64_t options_file_number = 0;
+ FileType type;
+ for (const auto& fname : file_names) {
+ if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+ options_file_number = std::max(number, options_file_number);
+ }
+ }
+ versions_->options_file_number_ = options_file_number;
+ }
+ }
+
+ return s;
+}
+
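+// Checks the format/compatible version keys stored in the persistent stats
+// column family. If they cannot be read, or are newer than this binary
+// supports, the stats CF is dropped and recreated; the current version keys
+// are written whenever the CF is (re)created.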
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+ mutex_.AssertHeld();
+ Status s;
+ // persist version when stats CF doesn't exist
+ bool should_persist_format_version = !persistent_stats_cfd_exists_;
+ mutex_.Unlock();
+ if (persistent_stats_cfd_exists_) {
+ // Check persistent stats format version compatibility. Drop and recreate
+ // persistent stats CF if format version is incompatible
+ uint64_t format_version_recovered = 0;
+ Status s_format = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+ uint64_t compatible_version_recovered = 0;
+ Status s_compatible = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kCompatibleVersion,
+ &compatible_version_recovered);
+  // abort reading from the existing stats CF if any of the following is
+  // true:
+ // 1. failed to read format version or compatible version from disk
+ // 2. sst's format version is greater than current format version, meaning
+ // this sst is encoded with a newer RocksDB release, and current compatible
+ // version is below the sst's compatible version
+ if (!s_format.ok() || !s_compatible.ok() ||
+ (kStatsCFCurrentFormatVersion < format_version_recovered &&
+ kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
+ if (!s_format.ok() || !s_compatible.ok()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Reading persistent stats version key failed. Format key: %s, "
+ "compatible key: %s",
+ s_format.ToString().c_str(), s_compatible.ToString().c_str());
+ } else {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Disable persistent stats due to corrupted or incompatible format "
+ "version\n");
+ }
+ DropColumnFamily(persist_stats_cf_handle_);
+ DestroyColumnFamilyHandle(persist_stats_cf_handle_);
+ ColumnFamilyHandle* handle = nullptr;
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ // should also persist version here because old stats CF is discarded
+ should_persist_format_version = true;
+ }
+ }
+ if (s.ok() && should_persist_format_version) {
+    // The persistent stats CF is being created for the first time; we need
+    // to write the format version key
+ WriteBatch batch;
+ batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
+ ToString(kStatsCFCurrentFormatVersion));
+ batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
+ ToString(kStatsCFCompatibleFormatVersion));
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ s = Write(wo, &batch);
+ }
+ mutex_.Lock();
+ return s;
+}
+
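+// Looks up the persistent stats column family recovered from the MANIFEST;
+// wraps it in a handle if it exists, otherwise creates it (temporarily
+// releasing the DB mutex to do so).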
+Status DBImpl::InitPersistStatsColumnFamily() {
+ mutex_.AssertHeld();
+ assert(!persist_stats_cf_handle_);
+ ColumnFamilyData* persistent_stats_cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
+
+ Status s;
+ if (persistent_stats_cfd != nullptr) {
+    // We are recovering from a DB which already contains a persistent stats
+    // CF; the CF was already created in VersionSet::ApplyOneVersionEdit, but
+    // the column family handle was not. We need to explicitly create the
+    // handle here.
+ persist_stats_cf_handle_ =
+ new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_);
+ } else {
+ mutex_.Unlock();
+ ColumnFamilyHandle* handle = nullptr;
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ mutex_.Lock();
+ }
+ return s;
+}
+
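+// Replays the given WAL files in order, inserting their records into the
+// memtables (subject to the configured WALRecoveryMode and optional
+// WalFilter), writing level-0 tables when a memtable fills up during
+// recovery, and recording the recovered log numbers in per-CF version edits.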
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname, static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) {
+ *this->status = s;
+ }
+ }
+ };
+
+ mutex_.AssertHeld();
+ Status status;
+ std::unordered_map<int, VersionEdit> version_edits;
+ // no need to refcount because iteration is under mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ version_edits.insert({cfd->GetID(), edit});
+ }
+ int job_id = next_job_id_.fetch_add(1);
+ {
+ auto stream = event_logger_.Log();
+ stream << "job" << job_id << "event"
+ << "recovery_started";
+ stream << "log_files";
+ stream.StartArray();
+ for (auto log_number : log_numbers) {
+ stream << log_number;
+ }
+ stream.EndArray();
+ }
+
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter != nullptr) {
+ std::map<std::string, uint32_t> cf_name_id_map;
+ std::map<uint32_t, uint64_t> cf_lognumber_map;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
+ cf_lognumber_map.insert(
+ std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
+ }
+
+ immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map,
+ cf_name_id_map);
+ }
+#endif
+
+ bool stop_replay_by_wal_filter = false;
+ bool stop_replay_for_corruption = false;
+ bool flushed = false;
+ uint64_t corrupted_log_number = kMaxSequenceNumber;
+ uint64_t min_log_number = MinLogNumberToKeep();
+ for (auto log_number : log_numbers) {
+ if (log_number < min_log_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Skipping log #%" PRIu64
+ " since it is older than min log to keep #%" PRIu64,
+ log_number, min_log_number);
+ continue;
+ }
+ // The previous incarnation may not have written any MANIFEST
+ // records after allocating this log number. So we manually
+ // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log_number);
+ // Open the log file
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+ auto logFileDropped = [this, &fname]() {
+ uint64_t bytes;
+ if (env_->GetFileSize(fname, &bytes).ok()) {
+ auto info_log = immutable_db_options_.info_log.get();
+ ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
+ static_cast<int>(bytes));
+ }
+ };
+ if (stop_replay_by_wal_filter) {
+ logFileDropped();
+ continue;
+ }
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ status = fs_->NewSequentialFile(fname,
+ fs_->OptimizeForLogRead(file_options_),
+ &file, nullptr);
+ if (!status.ok()) {
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ } else {
+          // Failed on one log file, but that's OK.
+          // Try the next one.
+ continue;
+ }
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size));
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = immutable_db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ if (!immutable_db_options_.paranoid_checks ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ reporter.status = nullptr;
+ } else {
+ reporter.status = &status;
+ }
+    // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+ &reporter, true /*checksum*/, log_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of
+    // the log, then read all the records and add them to a memtable.
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+
+ while (!stop_replay_by_wal_filter &&
+ reader.ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode) &&
+ status.ok()) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+          // In point-in-time recovery mode, if the sequence ids of the log
+          // files are consecutive, we continue recovery despite corruption.
+          // This could happen when we open and write to a corrupted DB, where
+          // the sequence id will start from the last sequence id we recovered.
+ if (sequence == *next_sequence) {
+ stop_replay_for_corruption = false;
+ }
+ if (stop_replay_for_corruption) {
+ logFileDropped();
+ break;
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter != nullptr) {
+ WriteBatch new_batch;
+ bool batch_changed = false;
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ immutable_db_options_.wal_filter->LogRecordFound(
+ log_number, fname, batch, &new_batch, &batch_changed);
+
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kContinueProcessing:
+            // do nothing, proceed normally
+ break;
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+ // skip current record
+ continue;
+ case WalFilter::WalProcessingOption::kStopReplay:
+ // skip current record and stop replay
+ stop_replay_by_wal_filter = true;
+ continue;
+ case WalFilter::WalProcessingOption::kCorruptedRecord: {
+ status =
+ Status::Corruption("Corruption reported by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ reporter.Corruption(record.size(), status);
+ continue;
+ }
+ break;
+ }
+ default: {
+ assert(false); // unhandled case
+ status = Status::NotSupported(
+ "Unknown WalProcessingOption returned"
+ " by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ } else {
+ // Ignore the error with current record processing.
+ continue;
+ }
+ }
+ }
+
+ if (batch_changed) {
+ // Make sure that the count in the new batch is
+            // within the original count.
+ int new_count = WriteBatchInternal::Count(&new_batch);
+ int original_count = WriteBatchInternal::Count(&batch);
+ if (new_count > original_count) {
+ ROCKS_LOG_FATAL(
+ immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64
+ " mode %d log filter %s returned "
+ "more records (%d) than original (%d) which is not allowed. "
+ "Aborting recovery.",
+ log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode),
+ immutable_db_options_.wal_filter->Name(), new_count,
+ original_count);
+ status = Status::NotSupported(
+ "More than original # of records "
+ "returned by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ return status;
+ }
+ // Set the same sequence number in the new_batch
+ // as the original batch.
+ WriteBatchInternal::SetSequence(&new_batch,
+ WriteBatchInternal::Sequence(&batch));
+ batch = new_batch;
+ }
+ }
+#endif // ROCKSDB_LITE
+
+      // If a column family was not found, it might mean that the WAL write
+      // batch references a column family that was dropped after the insert.
+      // We don't want to fail the whole write batch in that case --
+      // we just ignore the update.
+      // That's why we set ignore missing column families to true.
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(), &flush_scheduler_,
+ &trim_history_scheduler_, true, log_number, this,
+ false /* concurrent_memtable_writes */, next_sequence,
+ &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reporter.Corruption(record.size(), status);
+ continue;
+ }
+
+ if (has_valid_writes && !read_only) {
+        // we can do this because this is called before the client has access
+        // to the DB and there is only a single thread operating on the DB
+ ColumnFamilyData* cfd;
+
+ while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ // If this asserts, it means that InsertInto failed in
+ // filtering updates to already-flushed column families
+ assert(cfd->GetLogNumber() <= log_number);
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ return status;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ *next_sequence);
+ }
+ }
+ }
+
+ if (!status.ok()) {
+ if (status.IsNotSupported()) {
+ // We should not treat NotSupported as corruption. It is rather a clear
+ // sign that we are processing a WAL that is produced by an incompatible
+ // version of the code.
+ return status;
+ }
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // We should ignore all errors unconditionally
+ status = Status::OK();
+ } else if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+ // We should ignore the error but not continue replaying
+ status = Status::OK();
+ stop_replay_for_corruption = true;
+ corrupted_log_number = log_number;
+ if (corrupted_log_found != nullptr) {
+ *corrupted_log_found = true;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Point in time recovered to log #%" PRIu64
+ " seq #%" PRIu64,
+ log_number, *next_sequence);
+ } else {
+ assert(immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency);
+ return status;
+ }
+ }
+
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ }
+  // Compare the corrupted log number to every column family's current log
+  // number. Abort Open() if any column family's log number is greater than
+  // the corrupted log number, which means the CF contains data beyond the
+  // point of corruption. This could happen during PIT recovery when the WAL
+  // is corrupted and some (but not all) CFs are flushed.
+  // Exclude the PIT case where no log is dropped after the corruption point.
+  // This covers the case of empty logs after the corrupted log, in which we
+  // don't reset stop_replay_for_corruption.
+ if (stop_replay_for_corruption == true &&
+ (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->GetLogNumber() > corrupted_log_number) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Column family inconsistency: SST file contains data"
+ " beyond the point of corruption.");
+ return Status::Corruption("SST file is ahead of WALs");
+ }
+ }
+ }
+
+ // True if there's any data in the WALs; if not, we can skip re-processing
+ // them later
+ bool data_seen = false;
+ if (!read_only) {
+    // no need to refcount since the client still doesn't have access
+    // to the DB and cannot drop column families while we iterate
+ auto max_log_number = log_numbers.back();
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+
+ if (cfd->GetLogNumber() > max_log_number) {
+ // Column family cfd has already flushed the data
+ // from all logs. Memtable has to be empty because
+ // we filter the updates based on log_number
+ // (in WriteBatch::InsertInto)
+ assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+ assert(edit->NumEntries() == 0);
+ continue;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr);
+
+ // flush the final memtable (if non-empty)
+ if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+ // If flush happened in the middle of recovery (e.g. due to memtable
+ // being full), we flush at the end. Otherwise we'll need to record
+        // where we were at the last flush, which makes the logic complicated.
+ if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Recovery failed
+ break;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ versions_->LastSequence());
+ }
+ data_seen = true;
+ }
+
+ // Update the log number info in the version edit corresponding to this
+ // column family. Note that the version edits will be written to MANIFEST
+ // together later.
+      // Writing log_number in the manifest means that any log file
+      // with a number strictly less than (log_number + 1) is already
+      // recovered and should be ignored on the next reincarnation.
+      // Since we already recovered max_log_number, we want all logs
+      // with numbers `<= max_log_number` (including this one) to be ignored.
+ if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+ edit->SetLogNumber(max_log_number + 1);
+ }
+ }
+ if (status.ok()) {
+      // We must mark the next log number as used, even though it's
+      // not actually used. That is because VersionSet assumes
+      // VersionSet::next_file_number_ to always be strictly greater than any
+      // log number.
+ versions_->MarkFileNumberUsed(max_log_number + 1);
+
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> cf_opts;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ cfds.push_back(cfd);
+ cf_opts.push_back(cfd->GetLatestMutableCFOptions());
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ edit_lists.push_back({&iter->second});
+ }
+ // write MANIFEST with update
+ status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_,
+ directories_.GetDbDir(),
+ /*new_descriptor_log=*/true);
+ }
+ }
+
+ if (status.ok() && data_seen && !flushed) {
+ status = RestoreAliveLogFiles(log_numbers);
+ }
+
+ event_logger_.Log() << "job" << job_id << "event"
+ << "recovery_finished";
+
+ return status;
+}
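+
+// The recovery-mode branches above are driven entirely by
+// DBOptions::wal_recovery_mode. A minimal, hedged sketch of selecting a mode
+// before opening the DB follows; the path is a placeholder and error handling
+// is trimmed, so treat it as an illustration rather than a reference:
+//
+//   #include "rocksdb/db.h"
+//   #include "rocksdb/options.h"
+//
+//   rocksdb::Options options;
+//   options.create_if_missing = true;
+//   // Stop replay at the first corruption instead of failing Open().
+//   options.wal_recovery_mode =
+//       rocksdb::WALRecoveryMode::kPointInTimeRecovery;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);
+//   // ... use db, then:
+//   delete db;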
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers) {
+ if (log_numbers.empty()) {
+ return Status::OK();
+ }
+ Status s;
+ mutex_.AssertHeld();
+ assert(immutable_db_options_.avoid_flush_during_recovery);
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ // Mark these as alive so they'll be considered for deletion later by
+ // FindObsoleteFiles()
+ total_log_size_ = 0;
+ log_empty_ = false;
+ for (auto log_number : log_numbers) {
+ LogFileNumberSize log(log_number);
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+    // This gets the apparent size of the logs, not including preallocated space.
+ s = env_->GetFileSize(fname, &log.size);
+ if (!s.ok()) {
+ break;
+ }
+ total_log_size_ += log.size;
+ alive_log_files_.push_back(log);
+    // We preallocate space for logs, but after a crash and restart that
+    // preallocated space is not needed anymore. It is likely that only the
+    // last log has such preallocated space, so we only truncate the last log.
+ if (log_number == log_numbers.back()) {
+ std::unique_ptr<FSWritableFile> last_log;
+ Status truncate_status = fs_->ReopenWritableFile(
+ fname,
+ fs_->OptimizeForLogWrite(
+ file_options_,
+ BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ &last_log, nullptr);
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
+ }
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Close(IOOptions(), nullptr);
+ }
+      // Not a critical error if we fail to truncate.
+ if (!truncate_status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to truncate log #%" PRIu64 ": %s", log_number,
+ truncate_status.ToString().c_str());
+ }
+ }
+ }
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ return s;
+}
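+
+// RestoreAliveLogFiles() above is only reached when recovery deliberately
+// skips the final memtable flush. A hedged sketch of the option that enables
+// this path (the path below is a placeholder, not taken from this file):
+//
+//   rocksdb::Options options;
+//   // Keep recovered memtables in memory and keep their WALs alive instead
+//   // of flushing them to L0 during DB::Open().
+//   options.avoid_flush_during_recovery = true;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);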
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit) {
+ mutex_.AssertHeld();
+ const uint64_t start_micros = env_->NowMicros();
+ FileMetaData meta;
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ Status s;
+ TableProperties table_properties;
+ {
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": started",
+ cfd->GetName().c_str(), meta.fd.GetNumber());
+
+ // Get the latest mutable cf options while the mutex is still locked
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ bool paranoid_file_checks =
+ cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+
+ int64_t _current_time = 0;
+ env_->GetCurrentTime(&_current_time); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ meta.oldest_ancester_time = current_time;
+
+ {
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ mutex_.Unlock();
+
+ SequenceNumber earliest_write_conflict_snapshot;
+ std::vector<SequenceNumber> snapshot_seqs =
+ snapshots_.GetAll(&earliest_write_conflict_snapshot);
+ auto snapshot_checker = snapshot_checker_.get();
+ if (use_custom_gc_ && snapshot_checker == nullptr) {
+ snapshot_checker = DisableGCSnapshotChecker::Instance();
+ }
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter =
+ mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ s = BuildTable(
+ dbname_, env_, fs_.get(), *cfd->ioptions(), mutable_cf_options,
+ file_options_for_compaction_, cfd->table_cache(), iter.get(),
+ std::move(range_del_iters), &meta, cfd->internal_comparator(),
+ cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(),
+ snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ mutable_cf_options.sample_for_compression,
+ cfd->ioptions()->compression_opts, paranoid_file_checks,
+ cfd->internal_stats(), TableFileCreationReason::kRecovery,
+ &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */,
+ -1 /* level */, current_time, write_hint);
+ LogFlush(immutable_db_options_.info_log);
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+ cfd->GetName().c_str(), meta.fd.GetNumber(),
+ meta.fd.GetFileSize(), s.ToString().c_str());
+ mutex_.Lock();
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ int level = 0;
+ if (s.ok() && meta.fd.GetFileSize() > 0) {
+ edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+ meta.fd.GetFileSize(), meta.smallest, meta.largest,
+ meta.fd.smallest_seqno, meta.fd.largest_seqno,
+ meta.marked_for_compaction, meta.oldest_blob_file_number,
+ meta.oldest_ancester_time, meta.file_creation_time,
+ meta.file_checksum, meta.file_checksum_func_name);
+ }
+
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ stats.micros = env_->NowMicros() - start_micros;
+ stats.bytes_written = meta.fd.GetFileSize();
+ stats.num_output_files = 1;
+ cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+ cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
+ meta.fd.GetFileSize());
+ RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+ return s;
+}
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ if (db_options.persist_stats_to_disk) {
+ column_families.push_back(
+ ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options));
+ }
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ if (db_options.persist_stats_to_disk) {
+ assert(handles.size() == 2);
+ } else {
+ assert(handles.size() == 1);
+ }
+    // We can delete the handle since DBImpl always holds a reference to
+    // the default column family.
+ if (db_options.persist_stats_to_disk && handles[1] != nullptr) {
+ delete handles[1];
+ }
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ const bool kSeqPerBatch = true;
+ const bool kBatchPerTxn = true;
+ return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
+ !kSeqPerBatch, kBatchPerTxn);
+}
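+
+// A hedged usage sketch of the two Open() overloads above from an
+// application's point of view; the column family name and path are
+// illustrative assumptions, and error handling is omitted for brevity:
+//
+//   #include "rocksdb/db.h"
+//
+//   rocksdb::DBOptions db_options;
+//   db_options.create_if_missing = true;
+//   db_options.create_missing_column_families = true;
+//   std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs = {
+//       {rocksdb::kDefaultColumnFamilyName, rocksdb::ColumnFamilyOptions()},
+//       {"example_cf", rocksdb::ColumnFamilyOptions()}};
+//   std::vector<rocksdb::ColumnFamilyHandle*> handles;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(db_options, "/tmp/example_db",
+//                                         cf_descs, &handles, &db);
+//   // ... use db and handles, then clean up:
+//   for (auto* h : handles) delete h;
+//   delete db;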
+
+Status DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size, log::Writer** new_log) {
+ Status s;
+ std::unique_ptr<FSWritableFile> lfile;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ FileOptions opt_file_options =
+ fs_->OptimizeForLogWrite(file_options_, db_options);
+ std::string log_fname =
+ LogFileName(immutable_db_options_.wal_dir, log_file_num);
+
+ if (recycle_log_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "reusing log %" PRIu64 " from recycle list\n",
+ recycle_log_number);
+ std::string old_log_fname =
+ LogFileName(immutable_db_options_.wal_dir, recycle_log_number);
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1");
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2");
+ s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options,
+ &lfile, /*dbg=*/nullptr);
+ } else {
+ s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
+ }
+
+ if (s.ok()) {
+ lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+ lfile->SetPreallocationBlockSize(preallocate_block_size);
+
+ const auto& listeners = immutable_db_options_.listeners;
+ std::unique_ptr<WritableFileWriter> file_writer(
+ new WritableFileWriter(std::move(lfile), log_fname, opt_file_options,
+ env_, nullptr /* stats */, listeners));
+ *new_log = new log::Writer(std::move(file_writer), log_file_num,
+ immutable_db_options_.recycle_log_file_num > 0,
+ immutable_db_options_.manual_wal_flush);
+ }
+ return s;
+}
+
+Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn) {
+ Status s = SanitizeOptionsByTable(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = ValidateOptions(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ *dbptr = nullptr;
+ handles->clear();
+
+ size_t max_write_buffer_size = 0;
+ for (auto cf : column_families) {
+ max_write_buffer_size =
+ std::max(max_write_buffer_size, cf.options.write_buffer_size);
+ }
+
+ DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
+ s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir);
+ if (s.ok()) {
+ std::vector<std::string> paths;
+ for (auto& db_path : impl->immutable_db_options_.db_paths) {
+ paths.emplace_back(db_path.path);
+ }
+ for (auto& cf : column_families) {
+ for (auto& cf_path : cf.options.cf_paths) {
+ paths.emplace_back(cf_path.path);
+ }
+ }
+ for (auto& path : paths) {
+ s = impl->env_->CreateDirIfMissing(path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ // For recovery from NoSpace() error, we can only handle
+ // the case where the database is stored in a single path
+ if (paths.size() <= 1) {
+ impl->error_handler_.EnableAutoRecovery();
+ }
+ }
+
+ if (!s.ok()) {
+ delete impl;
+ return s;
+ }
+
+ s = impl->CreateArchivalDirectory();
+ if (!s.ok()) {
+ delete impl;
+ return s;
+ }
+
+ impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_);
+
+ impl->mutex_.Lock();
+ // Handles create_if_missing, error_if_exists
+ uint64_t recovered_seq(kMaxSequenceNumber);
+ s = impl->Recover(column_families, false, false, false, &recovered_seq);
+ if (s.ok()) {
+ uint64_t new_log_number = impl->versions_->NewFileNumber();
+ log::Writer* new_log = nullptr;
+ const size_t preallocate_block_size =
+ impl->GetWalPreallocateBlockSize(max_write_buffer_size);
+ s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/,
+ preallocate_block_size, &new_log);
+ if (s.ok()) {
+ InstrumentedMutexLock wl(&impl->log_write_mutex_);
+ impl->logfile_number_ = new_log_number;
+ assert(new_log != nullptr);
+ impl->logs_.emplace_back(new_log_number, new_log);
+ }
+
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd != nullptr) {
+ handles->push_back(
+ new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ impl->NewThreadStatusCfInfo(cfd);
+ } else {
+ if (db_options.create_missing_column_families) {
+ // missing column family, create it
+ ColumnFamilyHandle* handle;
+ impl->mutex_.Unlock();
+ s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+ impl->mutex_.Lock();
+ if (s.ok()) {
+ handles->push_back(handle);
+ } else {
+ break;
+ }
+ } else {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ impl->InstallSuperVersionAndScheduleWork(
+ cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
+ }
+ sv_context.Clean();
+ if (impl->two_write_queues_) {
+ impl->log_write_mutex_.Lock();
+ }
+ impl->alive_log_files_.push_back(
+ DBImpl::LogFileNumberSize(impl->logfile_number_));
+ if (impl->two_write_queues_) {
+ impl->log_write_mutex_.Unlock();
+ }
+
+ impl->DeleteObsoleteFiles();
+ s = impl->directories_.GetDbDir()->Fsync();
+ }
+ if (s.ok()) {
+      // In WritePrepared there could be gaps in the sequence numbers. This
+      // breaks the trick we use in kPointInTimeRecovery, which assumes that
+      // the first seq in the log right after the corrupted log is one larger
+      // than the last seq we read from the logs. To keep this trick working,
+      // we add a dummy entry with the expected sequence to the first log right
+      // after recovery.
+      // In the non-WritePrepared case the new log after recovery could also be
+      // empty, and thus miss the consecutive-seq hint needed to distinguish
+      // middle-of-log corruption from a corrupted log remaining after
+      // recovery. This case is also addressed by the dummy write.
+ if (recovered_seq != kMaxSequenceNumber) {
+ WriteBatch empty_batch;
+ WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
+ WriteOptions write_options;
+ uint64_t log_used, log_size;
+ log::Writer* log_writer = impl->logs_.back().writer;
+ s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size);
+ if (s.ok()) {
+ // Need to fsync, otherwise it might get lost after a power reset.
+ s = impl->FlushWAL(false);
+ if (s.ok()) {
+ s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+ }
+ }
+ }
+ }
+ }
+ if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+    // Try to read the format version, but don't fail Open() even if it fails.
+ s = impl->PersistentStatsProcessFormatVersion();
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ auto* vstorage = cfd->current()->storage_info();
+ for (int i = 1; i < vstorage->num_levels(); ++i) {
+ int num_files = vstorage->NumLevelFiles(i);
+ if (num_files > 0) {
+ s = Status::InvalidArgument(
+ "Not all files are at level 0. Cannot "
+ "open with FIFO compaction style.");
+ break;
+ }
+ }
+ }
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ impl->is_snapshot_supported_ = false;
+ }
+ if (cfd->ioptions()->merge_operator != nullptr &&
+ !cfd->mem()->IsMergeOperatorSupported()) {
+ s = Status::InvalidArgument(
+            "The memtable of column family %s does not support merge operator, "
+            "but its options.merge_operator is non-null",
+ cfd->GetName().c_str());
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::Open:Opened");
+ Status persist_options_status;
+ if (s.ok()) {
+ // Persist RocksDB Options before scheduling the compaction.
+ // The WriteOptionsFile() will release and lock the mutex internally.
+ persist_options_status = impl->WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+
+ *dbptr = impl;
+ impl->opened_successfully_ = true;
+ impl->MaybeScheduleFlushOrCompaction();
+ }
+ impl->mutex_.Unlock();
+
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ impl->immutable_db_options_.sst_file_manager.get());
+ if (s.ok() && sfm) {
+ // Notify SstFileManager about all sst files that already exist in
+ // db_paths[0] and cf_paths[0] when the DB is opened.
+
+    // SstFileManagerImpl needs to know the sizes of the files. For files whose
+    // size we already know (sst files that appear in the manifest - typically
+    // the vast majority of all files), we'll pass the size to SstFileManager.
+    // For all other files SstFileManager will query the size from the
+    // filesystem.
+
+ std::vector<LiveFileMetaData> metadata;
+
+ impl->mutex_.Lock();
+ impl->versions_->GetLiveFilesMetaData(&metadata);
+ impl->mutex_.Unlock();
+
+ std::unordered_map<std::string, uint64_t> known_file_sizes;
+ for (const auto& md : metadata) {
+ std::string name = md.name;
+ if (!name.empty() && name[0] == '/') {
+ name = name.substr(1);
+ }
+ known_file_sizes[name] = md.size;
+ }
+
+ std::vector<std::string> paths;
+ paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
+ for (auto& cf : column_families) {
+ if (!cf.options.cf_paths.empty()) {
+ paths.emplace_back(cf.options.cf_paths[0].path);
+ }
+ }
+ // Remove duplicate paths.
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+ for (auto& path : paths) {
+ std::vector<std::string> existing_files;
+ impl->immutable_db_options_.env->GetChildren(path, &existing_files);
+ for (auto& file_name : existing_files) {
+ uint64_t file_number;
+ FileType file_type;
+ std::string file_path = path + "/" + file_name;
+ if (ParseFileName(file_name, &file_number, &file_type) &&
+ file_type == kTableFile) {
+ if (known_file_sizes.count(file_name)) {
+ // We're assuming that each sst file name exists in at most one of
+ // the paths.
+ sfm->OnAddFile(file_path, known_file_sizes.at(file_name),
+ /* compaction */ false);
+ } else {
+ sfm->OnAddFile(file_path);
+ }
+ }
+ }
+ }
+
+    // Reserve some disk buffer space. This is a heuristic - when we run out
+    // of disk space, it ensures that there is at least write_buffer_size
+    // worth of free space before we resume DB writes. In low-disk-space
+    // conditions, we want to avoid a lot of small L0 files due to frequent
+    // WAL write failures and the resulting forced flushes.
+ sfm->ReserveDiskBuffer(max_write_buffer_size,
+ impl->immutable_db_options_.db_paths[0].path);
+ }
+#endif // !ROCKSDB_LITE
+
+ if (s.ok()) {
+ ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
+ impl);
+ LogFlush(impl->immutable_db_options_.info_log);
+ assert(impl->TEST_WALBufferIsEmpty());
+ // If the assert above fails then we need to FlushWAL before returning
+ // control back to the user.
+ if (!persist_options_status.ok()) {
+ s = Status::IOError(
+ "DB::Open() failed --- Unable to persist Options file",
+ persist_options_status.ToString());
+ }
+ }
+ if (s.ok()) {
+ impl->StartTimedTasks();
+ }
+ if (!s.ok()) {
+ for (auto* h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ *dbptr = nullptr;
+ }
+ return s;
+}
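+
+// The SstFileManager bookkeeping above only runs when the application has
+// configured one. A hedged sketch of attaching a manager before Open(); the
+// rate-limit value and path are arbitrary illustrations, not recommendations:
+//
+//   #include "rocksdb/sst_file_manager.h"
+//
+//   rocksdb::Options options;
+//   options.sst_file_manager.reset(
+//       rocksdb::NewSstFileManager(rocksdb::Env::Default()));
+//   // Optionally smooth out deletion I/O (bytes per second).
+//   options.sst_file_manager->SetDeleteRateBytesPerSecond(64 << 20);
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);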
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc
new file mode 100644
index 000000000..a4242bfe1
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_readonly.h"
+#include "db/arena_wrapped_db_iter.h"
+
+#include "db/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/merge_context.h"
+#include "monitoring/perf_context_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
+ const std::string& dbname)
+ : DBImpl(db_options, dbname) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in read only mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplReadOnly::~DBImplReadOnly() {}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+  // TODO: stopwatch DB_GET needed? perf timer needed?
+ PERF_TIMER_GUARD(get_snapshot_time);
+ Status s;
+ SequenceNumber snapshot = versions_->LastSequence();
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ SuperVersion* super_version = cfd->GetSuperVersion();
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey(key, snapshot);
+ PERF_TIMER_STOP(get_snapshot_time);
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ read_seq,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback);
+ auto internal_iter =
+ NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), read_seq);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplReadOnly::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+
+ for (auto cfh : column_families) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ auto* sv = cfd->GetSuperVersion()->Ref();
+ auto* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback);
+ auto* internal_iter =
+ NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), read_seq);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ iterators->push_back(db_iter);
+ }
+
+ return Status::OK();
+}
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+ DB** dbptr, bool /*error_if_log_file_exist*/) {
+ *dbptr = nullptr;
+
+ // Try to first open DB as fully compacted DB
+ Status s;
+ s = CompactedDBImpl::Open(options, dbname, dbptr);
+ if (s.ok()) {
+ return s;
+ }
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+
+ s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a
+    // reference to the default column family.
+ delete handles[0];
+ }
+ return s;
+}
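+
+// A hedged sketch of read-only usage (the path and key are placeholders).
+// Write operations on the returned handle are rejected by DBImplReadOnly with
+// Status::NotSupported():
+//
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::OpenForReadOnly(rocksdb::Options(),
+//                                                    "/tmp/existing_db", &db);
+//   std::string value;
+//   if (s.ok()) {
+//     s = db->Get(rocksdb::ReadOptions(), "example_key", &value);
+//   }
+//   delete db;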
+
+Status DB::OpenForReadOnly(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_log_file_exist) {
+ *dbptr = nullptr;
+ handles->clear();
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+ impl->mutex_.Lock();
+ Status s = impl->Recover(column_families, true /* read only */,
+ error_if_log_file_exist);
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd == nullptr) {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto* h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+
+#else // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& /*options*/,
+ const std::string& /*dbname*/, DB** /*dbptr*/,
+ bool /*error_if_log_file_exist*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
+ bool /*error_if_log_file_exist*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h
new file mode 100644
index 000000000..04d06b4a1
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImplReadOnly : public DBImpl {
+ public:
+ DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ DBImplReadOnly(const DBImplReadOnly&) = delete;
+ void operator=(const DBImplReadOnly&) = delete;
+
+ virtual ~DBImplReadOnly();
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ // TODO: Implement ReadOnly MultiGet?
+
+ using DBImpl::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ virtual Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ private:
+ friend class DB;
+};
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc
new file mode 100644
index 000000000..f0ec27c32
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc
@@ -0,0 +1,671 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_secondary.h"
+
+#include <cinttypes>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/merge_context.h"
+#include "logging/auto_roll_logger.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+DBImplSecondary::DBImplSecondary(const DBOptions& db_options,
+ const std::string& dbname)
+ : DBImpl(db_options, dbname) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in secondary mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplSecondary::~DBImplSecondary() {}
+
+Status DBImplSecondary::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool /*readonly*/, bool /*error_if_log_file_exist*/,
+ bool /*error_if_data_exists_in_logs*/, uint64_t*) {
+ mutex_.AssertHeld();
+
+ JobContext job_context(0);
+ Status s;
+ s = static_cast<ReactiveVersionSet*>(versions_.get())
+ ->Recover(column_families, &manifest_reader_, &manifest_reporter_,
+ &manifest_reader_status_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+  // Initialize max_total_in_memory_state_ before recovering the logs.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+ if (s.ok()) {
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+ single_column_family_mode_ =
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ // TODO: update options_file_number_ needed?
+
+ job_context.Clean();
+ return s;
+}
+
+// Find new WALs and apply them in order to the secondary instance.
+Status DBImplSecondary::FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ Status s;
+ std::vector<uint64_t> logs;
+ s = FindNewLogNumbers(&logs);
+ if (s.ok() && !logs.empty()) {
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context);
+ }
+ return s;
+}
+
+// List wal_dir and find all new WALs, returning their log numbers.
+Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) {
+ assert(logs != nullptr);
+ std::vector<std::string> filenames;
+ Status s;
+ s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("Failed to open wal_dir",
+ immutable_db_options_.wal_dir);
+ } else if (!s.ok()) {
+ return s;
+ }
+
+ // if log_readers_ is non-empty, it means we have applied all logs with log
+ // numbers smaller than the smallest log in log_readers_, so there is no
+ // need to pass these logs to RecoverLogFiles
+ uint64_t log_number_min = 0;
+ if (!log_readers_.empty()) {
+ log_number_min = log_readers_.begin()->first;
+ }
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == kLogFile &&
+ number >= log_number_min) {
+ logs->push_back(number);
+ }
+ }
+ // Recover logs in the order that they were generated
+ if (!logs->empty()) {
+ std::sort(logs->begin(), logs->end());
+ }
+ return s;
+}
+
+Status DBImplSecondary::MaybeInitLogReader(
+ uint64_t log_number, log::FragmentBufferedReader** log_reader) {
+ auto iter = log_readers_.find(log_number);
+ // make sure the log file is still present
+ if (iter == log_readers_.end() ||
+ iter->second->reader_->GetLogNumber() != log_number) {
+ // delete the obsolete log reader if log number mismatch
+ if (iter != log_readers_.end()) {
+ log_readers_.erase(iter);
+ }
+ // initialize log reader from log_number
+ // TODO: min_log_number_to_keep_2pc check needed?
+ // Open the log file
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ Status status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file,
+ nullptr);
+ if (!status.ok()) {
+ *log_reader = nullptr;
+ return status;
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size));
+ }
+
+ // Create the log reader.
+ LogReaderContainer* log_reader_container = new LogReaderContainer(
+ env_, immutable_db_options_.info_log, std::move(fname),
+ std::move(file_reader), log_number);
+ log_readers_.insert(std::make_pair(
+ log_number, std::unique_ptr<LogReaderContainer>(log_reader_container)));
+ }
+ iter = log_readers_.find(log_number);
+ assert(iter != log_readers_.end());
+ *log_reader = iter->second->reader_;
+ return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+ const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ mutex_.AssertHeld();
+ Status status;
+ for (auto log_number : log_numbers) {
+ log::FragmentBufferedReader* reader = nullptr;
+ status = MaybeInitLogReader(log_number, &reader);
+ if (!status.ok()) {
+ return status;
+ }
+ assert(reader != nullptr);
+ }
+ for (auto log_number : log_numbers) {
+ auto it = log_readers_.find(log_number);
+ assert(it != log_readers_.end());
+ log::FragmentBufferedReader* reader = it->second->reader_;
+ // Manually update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of
+    // the log, then read all the records and add them to a memtable.
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+
+ while (reader->ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode) &&
+ status.ok()) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reader->GetReporter()->Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+ std::vector<uint32_t> column_family_ids;
+ status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ if (cfds_changed->count(cfd) == 0) {
+ cfds_changed->insert(cfd);
+ }
+ const std::vector<FileMetaData*>& l0_files =
+ cfd->current()->storage_info()->LevelFiles(0);
+ SequenceNumber seq =
+ l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+          // If the write batch's sequence number is no larger than the largest
+          // sequence number persisted for this column family, then its data
+          // must reside in an SST file that has already been added by the
+          // prior MANIFEST replay.
+ if (seq_of_batch <= seq) {
+ continue;
+ }
+ auto curr_log_num = port::kMaxUint64;
+ if (cfd_to_current_log_.count(cfd) > 0) {
+ curr_log_num = cfd_to_current_log_[cfd];
+ }
+ // If the active memtable contains records added by replaying an
+ // earlier WAL, then we need to seal the memtable, add it to the
+ // immutable memtable list and create a new active memtable.
+ if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 ||
+ curr_log_num != log_number)) {
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ MemTable* new_mem =
+ cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch);
+ cfd->mem()->SetNextLogNumber(log_number);
+ cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ }
+ }
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(),
+ nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/,
+ true, log_number, this, false /* concurrent_memtable_writes */,
+ next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ }
+      // If a column family was not found, it might mean that the WAL write
+      // batch references a column family that was dropped after the insert.
+      // We don't want to fail the whole write batch in that case -- we just
+      // ignore the update. That's why we set ignore_missing_column_families
+      // to true.
+      // Passing a null flush_scheduler disables memtable flushing, which is
+      // what we need for secondary instances.
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ std::unordered_map<ColumnFamilyData*, uint64_t>::iterator iter =
+ cfd_to_current_log_.find(cfd);
+ if (iter == cfd_to_current_log_.end()) {
+ cfd_to_current_log_.insert({cfd, log_number});
+ } else if (log_number > iter->second) {
+ iter->second = log_number;
+ }
+ }
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ } else {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reader->GetReporter()->Corruption(record.size(), status);
+ }
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ }
+  // Remove log readers from the map after successfully recovering the WAL.
+ if (log_readers_.size() > 1) {
+ auto erase_iter = log_readers_.begin();
+ std::advance(erase_iter, log_readers_.size() - 1);
+ log_readers_.erase(log_readers_.begin(), erase_iter);
+ }
+ return status;
+}
+
+// Implementation of the DB interface
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ return GetImpl(read_options, column_family, key, value);
+}
+
+Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ // Acquire SuperVersion
+ SuperVersion* super_version = GetAndRefSuperVersion(cfd);
+ SequenceNumber snapshot = versions_->LastSequence();
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ Status s;
+ LookupKey lkey(key, snapshot);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool done = false;
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ super_version->imm->Get(
+ lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ return s;
+ }
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordTimeToHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+ return s;
+}
+
+Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+ Iterator* result = nullptr;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+ return NewErrorIterator(Status::NotSupported(
+ "tailing iterator not supported in secondary mode"));
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return NewErrorIterator(
+ Status::NotSupported("snapshot not supported in secondary mode"));
+ } else {
+ auto snapshot = versions_->LastSequence();
+ result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+ }
+ return result;
+}
+
+ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
+ const ReadOptions& read_options, ColumnFamilyData* cfd,
+ SequenceNumber snapshot, ReadCallback* read_callback) {
+ assert(nullptr != cfd);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ snapshot,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback);
+ auto internal_iter =
+ NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), snapshot);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplSecondary::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+ return Status::NotSupported(
+ "tailing iterator not supported in secondary mode");
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return Status::NotSupported("snapshot not supported in secondary mode");
+ } else {
+ SequenceNumber read_seq = versions_->LastSequence();
+ for (auto cfh : column_families) {
+ ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+ }
+ }
+ return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+ mutex_.AssertHeld();
+ Status s = DBImpl::CheckConsistency();
+  // If DBImpl::CheckConsistency(), which is stricter, returns success, then we
+  // do not need to give it a second chance.
+ if (s.ok()) {
+ return s;
+ }
+  // It's possible that DBImpl::CheckConsistency() can fail because the primary
+  // may have removed certain files, causing the GetFileSize(name) call to
+  // fail and return a PathNotFound. In this case, we take a best-effort
+  // approach and just proceed.
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ return Status::OK();
+ }
+
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+
+ std::string corruption_messages;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+ s.IsPathNotFound())) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ }
+ }
+ return corruption_messages.empty() ? Status::OK()
+ : Status::Corruption(corruption_messages);
+}
+
+Status DBImplSecondary::TryCatchUpWithPrimary() {
+ assert(versions_.get() != nullptr);
+ assert(manifest_reader_.get() != nullptr);
+ Status s;
+ // read the manifest and apply new changes to the secondary instance
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ JobContext job_context(0, true /*create_superversion*/);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
+ ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
+ static_cast<uint64_t>(versions_->LastSequence()));
+ for (ColumnFamilyData* cfd : cfds_changed) {
+ if (cfd->IsDropped()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
+ cfd->GetName().c_str());
+ continue;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+ cfd->current()->storage_info()->LevelSummary(&tmp));
+ }
+
+ // list wal_dir to discover new WALs and apply new changes to the secondary
+ // instance
+ if (s.ok()) {
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ if (s.ok()) {
+ for (auto cfd : cfds_changed) {
+ cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
+ &job_context.memtables_to_free);
+ auto& sv_context = job_context.superversion_contexts.back();
+ cfd->InstallSuperVersion(&sv_context, &mutex_);
+ sv_context.NewSuperVersion();
+ }
+ }
+ }
+ job_context.Clean();
+
+ // Cleanup unused, obsolete files.
+ JobContext purge_files_job_context(0);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+    // Currently, the secondary instance does not own the database files, thus
+    // it is unnecessary for the secondary to force a full scan.
+ FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
+ }
+ if (purge_files_job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(purge_files_job_context);
+ }
+ purge_files_job_context.Clean();
+ return s;
+}
+
+Status DB::OpenAsSecondary(const Options& options, const std::string& dbname,
+ const std::string& secondary_path, DB** dbptr) {
+ *dbptr = nullptr;
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path,
+ column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ delete handles[0];
+ }
+ return s;
+}
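+
+// A hedged sketch of how an application might use a secondary instance; paths
+// are placeholders and error handling is trimmed. TryCatchUpWithPrimary() is
+// the DB-level entry point implemented by DBImplSecondary above:
+//
+//   rocksdb::Options options;
+//   options.max_open_files = -1;  // required by OpenAsSecondary, see below
+//   rocksdb::DB* secondary = nullptr;
+//   rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
+//       options, "/tmp/primary_db", "/tmp/secondary_scratch", &secondary);
+//   if (s.ok()) {
+//     // Periodically replay new MANIFEST entries and WALs from the primary.
+//     s = secondary->TryCatchUpWithPrimary();
+//   }
+//   delete secondary;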
+
+Status DB::OpenAsSecondary(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::string& secondary_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ *dbptr = nullptr;
+ if (db_options.max_open_files != -1) {
+    // TODO (yanqin) maybe support max_open_files != -1 by creating hard links
+    // to SST files so that the secondary can still access old SSTs while the
+    // primary instance may delete the originals.
+ return Status::InvalidArgument("require max_open_files to be -1");
+ }
+
+ DBOptions tmp_opts(db_options);
+ Status s;
+ if (nullptr == tmp_opts.info_log) {
+ s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log);
+ if (!s.ok()) {
+ tmp_opts.info_log = nullptr;
+ }
+ }
+
+ handles->clear();
+ DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname);
+ impl->versions_.reset(new ReactiveVersionSet(
+ dbname, &impl->immutable_db_options_, impl->file_options_,
+ impl->table_cache_.get(), impl->write_buffer_manager_,
+ &impl->write_controller_));
+ impl->column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
+ impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_);
+
+ impl->mutex_.Lock();
+ s = impl->Recover(column_families, true, false, false);
+ if (s.ok()) {
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (nullptr == cfd) {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ SuperVersionContext sv_context(true /* create_superversion */);
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+#else // !ROCKSDB_LITE
+
+Status DB::OpenAsSecondary(const Options& /*options*/,
+ const std::string& /*name*/,
+ const std::string& /*secondary_path*/,
+ DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenAsSecondary(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::string& /*secondary_path*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h
new file mode 100644
index 000000000..24f2e7767
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.h
@@ -0,0 +1,333 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper class to hold log reader, log reporter, log status.
+class LogReaderContainer {
+ public:
+ LogReaderContainer()
+ : reader_(nullptr), reporter_(nullptr), status_(nullptr) {}
+ LogReaderContainer(Env* env, std::shared_ptr<Logger> info_log,
+ std::string fname,
+ std::unique_ptr<SequentialFileReader>&& file_reader,
+ uint64_t log_number) {
+ LogReporter* reporter = new LogReporter();
+ status_ = new Status();
+ reporter->env = env;
+ reporter->info_log = info_log.get();
+ reporter->fname = std::move(fname);
+ reporter->status = status_;
+ reporter_ = reporter;
+    // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+ reporter, true /*checksum*/,
+ log_number);
+ }
+ log::FragmentBufferedReader* reader_;
+ log::Reader::Reporter* reporter_;
+ Status* status_;
+ ~LogReaderContainer() {
+ delete reader_;
+ delete reporter_;
+ delete status_;
+ }
+ private:
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ std::string fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname.c_str(), static_cast<int>(bytes),
+ s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) {
+ *this->status = s;
+ }
+ }
+ };
+};
+
+// The secondary instance shares access to the same storage as the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make
+// best-effort attempts to catch up with the primary.
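+//
+// A minimal usage sketch of the public API that ends up here (illustrative
+// only: the paths are hypothetical and error handling is omitted):
+//
+//   Options options;
+//   options.max_open_files = -1;  // required by DB::OpenAsSecondary
+//   DB* db = nullptr;
+//   Status s = DB::OpenAsSecondary(options, "/path/to/primary_db",
+//                                  "/path/to/secondary_path", &db);
+//   if (s.ok()) {
+//     s = db->TryCatchUpWithPrimary();  // replay new MANIFEST/WAL updates
+//     std::string value;
+//     s = db->Get(ReadOptions(), "some_key", &value);
+//     delete db;
+//   }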
+class DBImplSecondary : public DBImpl {
+ public:
+ DBImplSecondary(const DBOptions& options, const std::string& dbname);
+ ~DBImplSecondary() override;
+
+ // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+ // and log_readers_ to facilitate future operations.
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only, bool error_if_log_file_exist,
+ bool error_if_data_exists_in_logs,
+ uint64_t* = nullptr) override;
+
+ // Implementations of the DB interface
+ using DB::Get;
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) override;
+
+ Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value);
+
+ using DBImpl::NewIterator;
+ Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback);
+
+ Status NewIterators(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Merge;
+ Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Delete;
+ Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SingleDelete;
+ Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* /*manifest_file_size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Flush;
+ Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetDBOptions;
+ Status SetDBOptions(const std::unordered_map<std::string, std::string>&
+ /*options_map*/) override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* /*cfd*/,
+ const std::unordered_map<std::string, std::string>& /*options_map*/)
+ override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction and/or write to MANIFEST.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or an error is encountered.
+  // If the amount of information in the log files to process is huge, this
+  // method can take a long time due to all the I/O and CPU costs.
+ Status TryCatchUpWithPrimary() override;
+
+  // Try to find the log reader for log_number in the log_readers_ map;
+  // initialize one if it doesn't exist.
+ Status MaybeInitLogReader(uint64_t log_number,
+ log::FragmentBufferedReader** log_reader);
+
+  // Check if all live files exist on the file system and that their file
+  // sizes match the in-memory records. It is possible that some live files
+  // may have been deleted by the primary. In this case, CheckConsistency()
+  // does not flag the missing file as an inconsistency.
+ Status CheckConsistency() override;
+
+ protected:
+  // ColumnFamilyCollector is a write batch handler that does nothing
+  // except record the unique column family IDs
+ class ColumnFamilyCollector : public WriteBatch::Handler {
+ std::unordered_set<uint32_t> column_family_ids_;
+
+ Status AddColumnFamilyId(uint32_t column_family_id) {
+ if (column_family_ids_.find(column_family_id) ==
+ column_family_ids_.end()) {
+ column_family_ids_.insert(column_family_id);
+ }
+ return Status::OK();
+ }
+
+ public:
+ explicit ColumnFamilyCollector() {}
+
+ ~ColumnFamilyCollector() override {}
+
+ Status PutCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ const std::unordered_set<uint32_t>& column_families() const {
+ return column_family_ids_;
+ }
+ };
+
+ Status CollectColumnFamilyIdsFromWriteBatch(
+ const WriteBatch& batch, std::vector<uint32_t>* column_family_ids) {
+ assert(column_family_ids != nullptr);
+ column_family_ids->clear();
+ ColumnFamilyCollector handler;
+ Status s = batch.Iterate(&handler);
+ if (s.ok()) {
+ for (const auto& cf : handler.column_families()) {
+ column_family_ids->push_back(cf);
+ }
+ }
+ return s;
+ }
+
+ bool OwnTablesAndLogs() const override {
+ // Currently, the secondary instance does not own the database files. It
+ // simply opens the files of the primary instance and tracks their file
+ // descriptors until they become obsolete. In the future, the secondary may
+ // create links to database files. OwnTablesAndLogs will return true then.
+ return false;
+ }
+
+ private:
+ friend class DB;
+
+ // No copying allowed
+ DBImplSecondary(const DBImplSecondary&);
+ void operator=(const DBImplSecondary&);
+
+ using DBImpl::Recover;
+
+ Status FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+ Status FindNewLogNumbers(std::vector<uint64_t>* logs);
+ // After manifest recovery, replay WALs and refresh log_readers_ if necessary
+ // REQUIRES: log_numbers are sorted in ascending order
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader_;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter_;
+ std::unique_ptr<Status> manifest_reader_status_;
+
+  // Cache log readers for each log number, used to continue WAL replay
+  // after recovery
+ std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
+
+ // Current WAL number replayed for each column family.
+ std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc
new file mode 100644
index 000000000..8f6f685e4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_write.cc
@@ -0,0 +1,1839 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "monitoring/perf_context_imp.h"
+#include "options/options_helper.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ return DB::Put(o, column_family, key, val);
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ if (!cfh->cfd()->ioptions()->merge_operator) {
+ return Status::NotSupported("Provide a merge_operator when opening DB");
+ } else {
+ return DB::Merge(o, column_family, key, val);
+ }
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ return DB::Delete(write_options, column_family, key);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ return DB::SingleDelete(write_options, column_family, key);
+}
+
+void DBImpl::SetRecoverableStatePreReleaseCallback(
+ PreReleaseCallback* callback) {
+ recoverable_state_pre_release_callback_.reset(callback);
+}
+
+Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
+ return WriteImpl(write_options, my_batch, nullptr, nullptr);
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback) {
+ return WriteImpl(write_options, my_batch, callback, nullptr);
+}
+#endif // ROCKSDB_LITE
+
+// The main write queue. This is the only write queue that updates LastSequence.
+// When using one write queue, the same sequence also indicates the last
+// published sequence.
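+//
+// As a rough map (summarizing the branches below, not additional behavior):
+// WriteImpl dispatches to WriteImplWALOnly on the second queue for WAL-only
+// batches (two_write_queues_ with disable_memtable), to WriteImplWALOnly plus
+// UnorderedWriteMemtable for unordered_write, to PipelinedWriteImpl when
+// enable_pipelined_write is set, and otherwise falls through to the default
+// group-commit path implemented in the rest of this function.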
+Status DBImpl::WriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used,
+ size_t batch_cnt,
+ PreReleaseCallback* pre_release_callback) {
+ assert(!seq_per_batch_ || batch_cnt != 0);
+ if (my_batch == nullptr) {
+ return Status::Corruption("Batch is nullptr!");
+ }
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Write(my_batch);
+ }
+ }
+ if (write_options.sync && write_options.disableWAL) {
+ return Status::InvalidArgument("Sync writes has to enable WAL.");
+ }
+ if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with concurrent prepares");
+ }
+ if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
+ // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with seq_per_batch");
+ }
+ if (immutable_db_options_.unordered_write &&
+ immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with unordered_write");
+ }
+ // Otherwise IsLatestPersistentState optimization does not make sense
+ assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
+ disable_memtable);
+
+ Status status;
+ if (write_options.low_pri) {
+ status = ThrottleLowPriWritesIfNeeded(write_options, my_batch);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ if (two_write_queues_ && disable_memtable) {
+ AssignOrder assign_order =
+ seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder;
+    // Otherwise these are WAL-only Prepare batches in the WriteCommitted
+    // policy and they don't consume a sequence.
+ return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch,
+ callback, log_used, log_ref, seq_used, batch_cnt,
+ pre_release_callback, assign_order,
+ kDontPublishLastSeq, disable_memtable);
+ }
+
+ if (immutable_db_options_.unordered_write) {
+ const size_t sub_batch_cnt = batch_cnt != 0
+ ? batch_cnt
+ // every key is a sub-batch consuming a seq
+ : WriteBatchInternal::Count(my_batch);
+ uint64_t seq;
+ // Use a write thread to i) optimize for WAL write, ii) publish last
+    // sequence in increasing order, iii) call pre_release_callback serially
+ status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback,
+ log_used, log_ref, &seq, sub_batch_cnt,
+ pre_release_callback, kDoAssignOrder,
+ kDoPublishLastSeq, disable_memtable);
+ TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
+ if (!status.ok()) {
+ return status;
+ }
+ if (seq_used) {
+ *seq_used = seq;
+ }
+ if (!disable_memtable) {
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
+ status = UnorderedWriteMemtable(write_options, my_batch, callback,
+ log_ref, seq, sub_batch_cnt);
+ }
+ return status;
+ }
+
+ if (immutable_db_options_.enable_pipelined_write) {
+ return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
+ log_ref, disable_memtable, seq_used);
+ }
+
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, batch_cnt, pre_release_callback);
+
+ if (!write_options.disableWAL) {
+ RecordTick(stats_, WRITE_WITH_WAL);
+ }
+
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ write_thread_.JoinBatchGroup(&w);
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ // we are a non-leader in a parallel group
+
+ if (w.ShouldWriteToMemtable()) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
+ batch_per_txn_, write_options.memtable_insert_hint_per_batch);
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+      // we're responsible for exiting the batch group
+ // TODO(myabandeh): propagate status to write_group
+ auto last_sequence = w.write_group->last_sequence;
+ versions_->SetLastSequence(last_sequence);
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupFollower(&w);
+ }
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ // STATE_COMPLETED conditional below handles exit
+
+ status = w.FinalStatus();
+ }
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ // write is complete and leader has updated sequence
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+  // Once it reaches this point, the current writer "w" will try to do its
+  // write job. It may also pick up some of the remaining writers in the
+  // "writers_" queue when suitable, and finish them in the same write batch.
+  // This is how a write job could be done by another writer.
+ WriteContext write_context;
+ WriteThread::WriteGroup write_group;
+ bool in_parallel_group = false;
+ uint64_t last_sequence = kMaxSequenceNumber;
+
+ mutex_.Lock();
+
+ bool need_log_sync = write_options.sync;
+ bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+ if (!two_write_queues_ || !disable_memtable) {
+    // With concurrent writes we preprocess only in the write thread that also
+    // writes to the memtable, to avoid synchronization issues on shared data
+    // structures with the other thread
+
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ if (!two_write_queues_) {
+ // Assign it after ::PreprocessWrite since the sequence might advance
+ // inside it by WriteRecoverableState
+ last_sequence = versions_->LastSequence();
+ }
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+ log::Writer* log_writer = logs_.back().writer;
+
+ mutex_.Unlock();
+
+ // Add to log and apply to memtable. We can release the lock
+ // during this phase since &w is currently responsible for logging
+ // and protects against concurrent loggers and concurrent writes
+ // into memtables
+
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
+
+ if (status.ok()) {
+ // Rules for when we can update the memtable concurrently
+ // 1. supported by memtable
+ // 2. Puts are not okay if inplace_update_support
+ // 3. Merges are not okay
+ //
+ // Rules 1..2 are enforced by checking the options
+ // during startup (CheckConcurrentWritesSupported), so if
+ // options.allow_concurrent_memtable_write is true then they can be
+ // assumed to be true. Rule 3 is checked for each batch. We could
+    // relax rule 2 if we could prevent write batches from referring
+ // more than once to a particular key.
+ bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+ write_group.size > 1;
+ size_t total_count = 0;
+ size_t valid_batches = 0;
+ size_t total_byte_size = 0;
+ size_t pre_release_callback_cnt = 0;
+ for (auto* writer : write_group) {
+ if (writer->CheckCallback(this)) {
+ valid_batches += writer->batch_cnt;
+ if (writer->ShouldWriteToMemtable()) {
+ total_count += WriteBatchInternal::Count(writer->batch);
+ parallel = parallel && !writer->batch->HasMerge();
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+    // Note about seq_per_batch_: either disableWAL is set for the entire
+    // write group or it is not. In either case we increment the seq for each
+    // write batch with no failed callback. This means that there could be a
+    // batch with disable_memtable in between; although we do not write this
+    // batch to the memtable, it still consumes a seq. Otherwise, if
+    // !seq_per_batch_, we increment the seq per valid key written to the
+    // memtable.
+ size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
+
+ const bool concurrent_update = two_write_queues_;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count,
+ concurrent_update);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ if (!two_write_queues_) {
+ if (status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ status = WriteToWAL(write_group, log_writer, log_used, need_log_sync,
+ need_log_dir_sync, last_sequence + 1);
+ }
+ } else {
+ if (status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
+ seq_inc);
+ } else {
+ // Otherwise we inc seq number for memtable writes
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+ }
+ assert(last_sequence != kMaxSequenceNumber);
+ const SequenceNumber current_sequence = last_sequence + 1;
+ last_sequence += seq_inc;
+
+ // PreReleaseCallback is called after WAL write and before memtable write
+ if (status.ok()) {
+ SequenceNumber next_sequence = current_sequence;
+ size_t index = 0;
+ // Note: the logic for advancing seq here must be consistent with the
+ // logic in WriteBatchInternal::InsertInto(write_group...) as well as
+ // with WriteBatchInternal::InsertInto(write_batch...) that is called on
+ // the merged batch during recovery from the WAL.
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = next_sequence;
+ if (writer->pre_release_callback) {
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = ws;
+ break;
+ }
+ }
+ if (seq_per_batch_) {
+ assert(writer->batch_cnt);
+ next_sequence += writer->batch_cnt;
+ } else if (writer->ShouldWriteToMemtable()) {
+ next_sequence += WriteBatchInternal::Count(writer->batch);
+ }
+ }
+ }
+
+ if (status.ok()) {
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ if (!parallel) {
+ // w.sequence will be set inside InsertInto
+ w.status = WriteBatchInternal::InsertInto(
+ write_group, current_sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families,
+ 0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
+ batch_per_txn_);
+ } else {
+ write_group.last_sequence = last_sequence;
+ write_thread_.LaunchParallelMemTableWriters(&write_group);
+ in_parallel_group = true;
+
+        // Each parallel follower does its own writes. The leader should
+        // also do its own.
+ if (w.ShouldWriteToMemtable()) {
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ assert(w.sequence == current_sequence);
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/,
+ this, true /*concurrent_memtable_writes*/, seq_per_batch_,
+ w.batch_cnt, batch_per_txn_,
+ write_options.memtable_insert_hint_per_batch);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(status);
+ }
+
+ if (need_log_sync) {
+ mutex_.Lock();
+ MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
+ mutex_.Unlock();
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+ // hence provide a simple implementation that is not necessarily efficient.
+ if (two_write_queues_) {
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ }
+
+ bool should_exit_batch_group = true;
+ if (in_parallel_group) {
+    // CompleteParallelMemTableWriter returns true if this thread should
+    // handle the exit; false means somebody else did
+ should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
+ }
+ if (should_exit_batch_group) {
+ if (status.ok()) {
+ // Note: if we are to resume after non-OK statuses we need to revisit how
+      // we react to non-OK statuses here.
+ versions_->SetLastSequence(last_sequence);
+ }
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupLeader(write_group, status);
+ }
+
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ return status;
+}
+
+Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ WriteContext write_context;
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable);
+ write_thread_.JoinBatchGroup(&w);
+ if (w.state == WriteThread::STATE_GROUP_LEADER) {
+ WriteThread::WriteGroup wal_write_group;
+ if (w.callback && !w.callback->AllowWriteBatching()) {
+ write_thread_.WaitForMemTableWriters();
+ }
+ mutex_.Lock();
+ bool need_log_sync = !write_options.disableWAL && write_options.sync;
+ bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ w.status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ log::Writer* log_writer = logs_.back().writer;
+ mutex_.Unlock();
+
+    // This can set a non-OK status if the callback fails.
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
+ const SequenceNumber current_sequence =
+ write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
+ size_t total_count = 0;
+ size_t total_byte_size = 0;
+
+ if (w.status.ok()) {
+ SequenceNumber next_sequence = current_sequence;
+ for (auto writer : wal_write_group) {
+ if (writer->CheckCallback(this)) {
+ if (writer->ShouldWriteToMemtable()) {
+ writer->sequence = next_sequence;
+ size_t count = WriteBatchInternal::Count(writer->batch);
+ next_sequence += count;
+ total_count += count;
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ }
+ }
+ if (w.disable_wal) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
+ }
+
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ if (w.status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
+ RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
+ if (wal_write_group.size > 1) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ wal_write_group.size - 1);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
+ }
+ w.status = WriteToWAL(wal_write_group, log_writer, log_used,
+ need_log_sync, need_log_dir_sync, current_sequence);
+ }
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(w.status);
+ }
+
+ if (need_log_sync) {
+ mutex_.Lock();
+ MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status);
+ mutex_.Unlock();
+ }
+
+ write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
+ }
+
+ WriteThread::WriteGroup memtable_write_group;
+ if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
+ PERF_TIMER_GUARD(write_memtable_time);
+ assert(w.ShouldWriteToMemtable());
+ write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
+ if (memtable_write_group.size > 1 &&
+ immutable_db_options_.allow_concurrent_memtable_write) {
+ write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
+ } else {
+ memtable_write_group.status = WriteBatchInternal::InsertInto(
+ memtable_write_group, w.sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
+ versions_->SetLastSequence(memtable_write_group.last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
+ }
+ }
+
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ assert(w.ShouldWriteToMemtable());
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+ MemTableInsertStatusCheck(w.status);
+ versions_->SetLastSequence(w.write_group->last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ return w.FinalStatus();
+}
+
+Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback, uint64_t log_ref,
+ SequenceNumber seq,
+ const size_t sub_batch_cnt) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ false /*disable_memtable*/);
+
+ if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) {
+ w.sequence = seq;
+ size_t total_count = WriteBatchInternal::Count(my_batch);
+ InternalStats* stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+
+ WriteStatusCheck(w.status);
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ }
+
+ size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1;
+ if (pending_cnt == 0) {
+    // switch_cv_ waits until pending_memtable_writes_ == 0. Locking its mutex
+    // before notifying ensures that the cv is in the waiting state when it is
+    // notified, thus not missing the update to pending_memtable_writes_ even
+    // though it is not modified under the mutex.
+ std::lock_guard<std::mutex> lck(switch_mutex_);
+ switch_cv_.notify_all();
+ }
+
+ if (!w.FinalStatus().ok()) {
+ return w.FinalStatus();
+ }
+ return Status::OK();
+}
+
+// The 2nd write queue. If enabled, it will be used only for WAL-only writes.
+// This is the only queue that updates LastPublishedSequence, which is only
+// applicable in a two-queue setting.
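+// Illustrative callers, as seen in WriteImpl above: WAL-only batches such as
+// two-phase-commit Prepare under the WriteCommitted policy when
+// two_write_queues_ is enabled (with disable_memtable == true), and the
+// unordered_write path, which also goes through this function (with the main
+// write queue) to write the WAL and publish the last sequence before the
+// memtables are updated concurrently.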
+Status DBImpl::WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable) {
+ Status status;
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, sub_batch_cnt, pre_release_callback);
+ RecordTick(stats_, WRITE_WITH_WAL);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ write_thread->JoinBatchGroup(&w);
+ assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+ if (publish_last_seq == kDoPublishLastSeq) {
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ WriteContext write_context;
+ if (error_handler_.IsDBStopped()) {
+ status = error_handler_.GetBGError();
+ }
+ // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
+ // without paying the cost of obtaining the mutex.
+ if (status.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ bool need_log_sync = false;
+ status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ WriteStatusCheck(status);
+ }
+ if (!status.ok()) {
+ WriteThread::WriteGroup write_group;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ return status;
+ }
+ }
+
+ WriteThread::WriteGroup write_group;
+ uint64_t last_sequence;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ // Note: no need to update last_batch_group_size_ here since the batch writes
+ // to WAL only
+
+ size_t pre_release_callback_cnt = 0;
+ size_t total_byte_size = 0;
+ for (auto* writer : write_group) {
+ if (writer->CheckCallback(this)) {
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+
+ const bool concurrent_update = true;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ size_t seq_inc = 0 /* total_count */;
+ if (assign_order == kDoAssignOrder) {
+ size_t total_batch_cnt = 0;
+ for (auto* writer : write_group) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ if (!writer->CallbackFailed()) {
+ total_batch_cnt += writer->batch_cnt;
+ }
+ }
+ seq_inc = total_batch_cnt;
+ }
+ if (!write_options.disableWAL) {
+ status =
+ ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+ } else {
+ // Otherwise we inc seq number to do solely the seq allocation
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+
+ size_t memtable_write_cnt = 0;
+ auto curr_seq = last_sequence + 1;
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = curr_seq;
+ if (assign_order == kDoAssignOrder) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ curr_seq += writer->batch_cnt;
+ }
+ if (!writer->disable_memtable) {
+ memtable_write_cnt++;
+ }
+ // else seq advances only by memtable writes
+ }
+ if (status.ok() && write_options.sync) {
+ assert(!write_options.disableWAL);
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(status);
+ }
+ if (status.ok()) {
+ size_t index = 0;
+ for (auto* writer : write_group) {
+ if (!writer->CallbackFailed() && writer->pre_release_callback) {
+ assert(writer->sequence != kMaxSequenceNumber);
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = ws;
+ break;
+ }
+ }
+ }
+ }
+ if (publish_last_seq == kDoPublishLastSeq) {
+ versions_->SetLastSequence(last_sequence + seq_inc);
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ }
+ if (immutable_db_options_.unordered_write && status.ok()) {
+ pending_memtable_writes_ += memtable_write_cnt;
+ }
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return status;
+}
+
+void DBImpl::WriteStatusCheck(const Status& status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ if (immutable_db_options_.paranoid_checks && !status.ok() &&
+ !status.IsBusy() && !status.IsIncomplete()) {
+ mutex_.Lock();
+ error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+ mutex_.Unlock();
+ }
+}
+
+void DBImpl::MemTableInsertStatusCheck(const Status& status) {
+ // A non-OK status here indicates that the state implied by the
+ // WAL has diverged from the in-memory state. This could be
+ // because of a corrupt write_batch (very bad), or because the
+ // client specified an invalid column family and didn't specify
+ // ignore_missing_column_families.
+ if (!status.ok()) {
+ mutex_.Lock();
+ assert(!error_handler_.IsBGWorkStopped());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
+ mutex_.Unlock();
+ }
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+ bool* need_log_sync,
+ WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr && need_log_sync != nullptr);
+ Status status;
+
+ if (error_handler_.IsDBStopped()) {
+ status = error_handler_.GetBGError();
+ }
+
+ PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
+
+ assert(!single_column_family_mode_ ||
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
+ if (UNLIKELY(status.ok() && !single_column_family_mode_ &&
+ total_log_size_ > GetMaxTotalWalSize())) {
+ WaitForPendingWrites();
+ status = SwitchWAL(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+ // Before a new memtable is added in SwitchMemtable(),
+ // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+ // thread is writing to another DB with the same write buffer, they may also
+ // be flushed. We may end up with flushing much more DBs than needed. It's
+ // suboptimal but still correct.
+ WaitForPendingWrites();
+ status = HandleWriteBufferFull(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
+ status = TrimMemtableHistory(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+ WaitForPendingWrites();
+ status = ScheduleFlushes(write_context);
+ }
+
+ PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+
+ if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+ write_controller_.NeedsDelay()))) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the size
+    // of the previous one. It might create a fairness issue in that
+    // expiration might happen for smaller writes while larger writes can go
+    // through. Can optimize it if it becomes an issue.
+ status = DelayWrite(last_batch_group_size_, write_options);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ if (status.ok() && *need_log_sync) {
+    // Wait until the parallel syncs are finished. Any sync process has to
+    // sync the front log too, so it is enough to check the status of front().
+    // We do a while loop since log_sync_cv_ is signalled when any sync is
+    // finished.
+    // Note: there does not seem to be a reason to wait for parallel sync at
+    // this early step, but it is not important since parallel sync (SyncWAL)
+    // and need_log_sync are usually not used together.
+ while (logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ for (auto& log : logs_) {
+ assert(!log.getting_synced);
+      // This is just to prevent the logs from being synced by a parallel
+      // SyncWAL call. We will do the actual syncing later, after we write to
+      // the WAL.
+      // Note: there does not seem to be a reason to set this early, before we
+      // actually write to the WAL
+ log.getting_synced = true;
+ }
+ } else {
+ *need_log_sync = false;
+ }
+
+ return status;
+}
+
+WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, size_t* write_with_wal,
+ WriteBatch** to_be_cached_state) {
+ assert(write_with_wal != nullptr);
+ assert(tmp_batch != nullptr);
+ assert(*to_be_cached_state == nullptr);
+ WriteBatch* merged_batch = nullptr;
+ *write_with_wal = 0;
+ auto* leader = write_group.leader;
+ assert(!leader->disable_wal); // Same holds for all in the batch group
+ if (write_group.size == 1 && !leader->CallbackFailed() &&
+ leader->batch->GetWalTerminationPoint().is_cleared()) {
+    // We simply write the first WriteBatch to the WAL if the group contains
+    // only one batch, that batch should be written to the WAL, and the batch
+    // is not to be truncated
+ merged_batch = leader->batch;
+ if (WriteBatchInternal::IsLatestPersistentState(merged_batch)) {
+ *to_be_cached_state = merged_batch;
+ }
+ *write_with_wal = 1;
+ } else {
+ // WAL needs all of the batches flattened into a single batch.
+ // We could avoid copying here with an iov-like AddRecord
+ // interface
+ merged_batch = tmp_batch;
+ for (auto writer : write_group) {
+ if (!writer->CallbackFailed()) {
+ WriteBatchInternal::Append(merged_batch, writer->batch,
+ /*WAL_only*/ true);
+ if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
+        // We only need to cache the last such write batch
+ *to_be_cached_state = writer->batch;
+ }
+ (*write_with_wal)++;
+ }
+ }
+ }
+ return merged_batch;
+}
+
+// When two_write_queues_ is disabled, this function is called from the only
+// write thread. Otherwise this must be called holding log_write_mutex_.
+Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+ log::Writer* log_writer, uint64_t* log_used,
+ uint64_t* log_size) {
+ assert(log_size != nullptr);
+ Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
+ *log_size = log_entry.size();
+  // When two_write_queues_ is enabled, WriteToWAL has to be protected from
+  // concurrent calls from the two queues anyway, and log_write_mutex_ is
+  // already held. Otherwise, if manual_wal_flush_ is enabled, we need to
+  // protect log_writer->AddRecord from possible concurrent calls via FlushWAL
+  // by the application.
+ const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+  // Due to performance concerns about missed branch prediction, penalize the
+  // new manual_wal_flush_ feature (by UNLIKELY) instead of the more common
+  // case when we do not need any locking.
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Lock();
+ }
+ Status status = log_writer->AddRecord(log_entry);
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Unlock();
+ }
+ if (log_used != nullptr) {
+ *log_used = logfile_number_;
+ }
+ total_log_size_ += log_entry.size();
+ // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here
+ // since alive_log_files_ might be modified concurrently
+ alive_log_files_.back().AddSize(log_entry.size());
+ log_empty_ = false;
+ return status;
+}
+
+Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence) {
+ Status status;
+
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch = MergeBatch(write_group, &tmp_batch_,
+ &write_with_wal, &to_be_cached_state);
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ uint64_t log_size;
+ status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+
+ if (status.ok() && need_log_sync) {
+ StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
+ // It's safe to access logs_ with unlocked mutex_ here because:
+ // - we've set getting_synced=true for all logs,
+ // so other threads won't pop from logs_ while we're here,
+ // - only writer thread can push to logs_, and we're in
+ // writer thread, so no one will push to logs_,
+ // - as long as other threads don't modify it, it's safe to read
+ // from std::deque from multiple threads concurrently.
+ for (auto& log : logs_) {
+ status = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (status.ok() && need_log_dir_sync) {
+ // We only sync WAL directory the first time WAL syncing is
+ // requested, so that in case users never turn on WAL sync,
+ // we can avoid the disk I/O in the write code path.
+ status = directories_.GetWalDir()->Fsync();
+ }
+ }
+
+ if (merged_batch == &tmp_batch_) {
+ tmp_batch_.Clear();
+ }
+ if (status.ok()) {
+ auto stats = default_cf_internal_stats_;
+ if (need_log_sync) {
+ stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ }
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return status;
+}
+
+Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used,
+ SequenceNumber* last_sequence,
+ size_t seq_inc) {
+ Status status;
+
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ WriteBatch tmp_batch;
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch =
+ MergeBatch(write_group, &tmp_batch, &write_with_wal, &to_be_cached_state);
+
+  // We need to lock log_write_mutex_ since logs_ and alive_log_files_ might
+  // be pushed back concurrently
+ log_write_mutex_.Lock();
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+ *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ auto sequence = *last_sequence + 1;
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ log::Writer* log_writer = logs_.back().writer;
+ uint64_t log_size;
+ status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+ log_write_mutex_.Unlock();
+
+ if (status.ok()) {
+ const bool concurrent = true;
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
+ concurrent);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal,
+ concurrent);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return status;
+}
+
+Status DBImpl::WriteRecoverableState() {
+ mutex_.AssertHeld();
+ if (!cached_recoverable_state_empty_) {
+ bool dont_care_bool;
+ SequenceNumber next_seq;
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ SequenceNumber seq;
+ if (two_write_queues_) {
+ seq = versions_->FetchAddLastAllocatedSequence(0);
+ } else {
+ seq = versions_->LastSequence();
+ }
+ WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1);
+ auto status = WriteBatchInternal::InsertInto(
+ &cached_recoverable_state_, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_, true,
+ 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
+ &next_seq, &dont_care_bool, seq_per_batch_);
+ auto last_seq = next_seq - 1;
+ if (two_write_queues_) {
+ versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+ versions_->SetLastPublishedSequence(last_seq);
+ }
+ versions_->SetLastSequence(last_seq);
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ if (status.ok() && recoverable_state_pre_release_callback_) {
+ const bool DISABLE_MEMTABLE = true;
+ for (uint64_t sub_batch_seq = seq + 1;
+ sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
+ uint64_t const no_log_num = 0;
+ // Unlock it since the callback might end up locking mutex. e.g.,
+ // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+ mutex_.Unlock();
+ status = recoverable_state_pre_release_callback_->Callback(
+ sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
+ mutex_.Lock();
+ }
+ }
+ if (status.ok()) {
+ cached_recoverable_state_.Clear();
+ cached_recoverable_state_empty_ = true;
+ }
+ return status;
+ }
+ return Status::OK();
+}
+
+void DBImpl::SelectColumnFamiliesForAtomicFlush(
+ autovector<ColumnFamilyData*>* cfds) {
+ for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds->push_back(cfd);
+ }
+ }
+}
+
+// Assign sequence number for atomic flush.
+void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
+ assert(immutable_db_options_.atomic_flush);
+ auto seq = versions_->LastSequence();
+ for (auto cfd : cfds) {
+ cfd->imm()->AssignAtomicFlushSeq(seq);
+ }
+}
+
+Status DBImpl::SwitchWAL(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+ if (alive_log_files_.begin()->getting_flushed) {
+ return status;
+ }
+
+ auto oldest_alive_log = alive_log_files_.begin()->number;
+ bool flush_wont_release_oldest_log = false;
+ if (allow_2pc()) {
+ auto oldest_log_with_uncommitted_prep =
+ logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+
+ assert(oldest_log_with_uncommitted_prep == 0 ||
+ oldest_log_with_uncommitted_prep >= oldest_alive_log);
+ if (oldest_log_with_uncommitted_prep > 0 &&
+ oldest_log_with_uncommitted_prep == oldest_alive_log) {
+ if (unable_to_release_oldest_log_) {
+ // we already attempted to flush all column families dependent on
+ // the oldest alive log but the log still contained uncommitted
+ // transactions so there is still nothing that we can do.
+ return status;
+ } else {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to release oldest log due to uncommitted transaction");
+ unable_to_release_oldest_log_ = true;
+ flush_wont_release_oldest_log = true;
+ }
+ }
+ }
+ if (!flush_wont_release_oldest_log) {
+ // we only mark this log as getting flushed if we have successfully
+ // flushed all data in this log. If this log contains outstanding prepared
+ // transactions then we cannot flush this log until those transactions are
+    // committed.
+ unable_to_release_oldest_log_ = false;
+ alive_log_files_.begin()->getting_flushed = true;
+ }
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing all column families with data in WAL number %" PRIu64
+ ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+ oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+  // No need to refcount because the drop happens in the write thread, so it
+  // can't happen while we're in the write thread
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->OldestLogToKeep() <= oldest_alive_log) {
+ cfds.push_back(cfd);
+ }
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (const auto cfd : cfds) {
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+  // Before a new memtable is added in SwitchMemtable(),
+  // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+  // thread is writing to another DB with the same write buffer, that DB may
+  // also be flushed. We may end up flushing many more DBs than needed. It's
+  // suboptimal but still correct.
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing column family with oldest memtable entry. Write buffer is "
+ "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".",
+ write_buffer_manager_->memory_usage(),
+ write_buffer_manager_->buffer_size());
+  // No need to refcount because a column family drop happens in the write
+  // thread, so it can't happen while we're in the write thread.
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ ColumnFamilyData* cfd_picked = nullptr;
+ SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
+
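+    // Pick the column family whose active memtable has the smallest creation
+    // sequence number, i.e. the one that has been holding data the longest.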
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (!cfd->mem()->IsEmpty()) {
+        // We only consider the active memtable, hoping the immutable
+        // memtables are already in the process of flushing.
+ uint64_t seq = cfd->mem()->GetCreationSeq();
+ if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) {
+ cfd_picked = cfd;
+ seq_num_for_cf_picked = seq;
+ }
+ }
+ }
+ if (cfd_picked != nullptr) {
+ cfds.push_back(cfd_picked);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ for (const auto cfd : cfds) {
+ if (cfd->mem()->IsEmpty()) {
+ continue;
+ }
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (const auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+uint64_t DBImpl::GetMaxTotalWalSize() const {
+ mutex_.AssertHeld();
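+  // If max_total_wal_size is unset (0), default to four times the aggregate
+  // memtable budget across all column families.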
+ return mutable_db_options_.max_total_wal_size == 0
+ ? 4 * max_total_in_memory_state_
+ : mutable_db_options_.max_total_wal_size;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::DelayWrite(uint64_t num_bytes,
+ const WriteOptions& write_options) {
+ uint64_t time_delayed = 0;
+ bool delayed = false;
+ {
+ StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed);
+ uint64_t delay = write_controller_.GetDelay(env_, num_bytes);
+ if (delay > 0) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+
+      // Notify write_thread_ about the stall so it can set up a barrier and
+      // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone");
+ mutex_.Unlock();
+      // We will delay the write until we have slept for `delay` microseconds
+      // or we no longer need a delay.
+ const uint64_t kDelayInterval = 1000;
+ uint64_t stall_end = sw.start_time() + delay;
+ while (write_controller_.NeedsDelay()) {
+ if (env_->NowMicros() >= stall_end) {
+ // We already delayed this write `delay` microseconds
+ break;
+ }
+
+ delayed = true;
+ // Sleep for 0.001 seconds
+ env_->SleepForMicroseconds(kDelayInterval);
+ }
+ mutex_.Lock();
+ write_thread_.EndWriteStall();
+ }
+
+    // Don't wait if there's a background error, even if it's a soft error. We
+    // might wait here indefinitely, as the background compaction may never
+    // finish successfully, resulting in the stall condition lasting
+    // indefinitely.
+ while (error_handler_.GetBGError().ok() && write_controller_.IsStopped()) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ delayed = true;
+
+      // Notify write_thread_ about the stall so it can set up a barrier and
+      // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+ bg_cv_.Wait();
+ write_thread_.EndWriteStall();
+ }
+ }
+ assert(!delayed || !write_options.no_slowdown);
+ if (delayed) {
+ default_cf_internal_stats_->AddDBStats(
+ InternalStats::kIntStatsWriteStallMicros, time_delayed);
+ RecordTick(stats_, STALL_MICROS, time_delayed);
+ }
+
+ // If DB is not in read-only mode and write_controller is not stopping
+ // writes, we can ignore any background errors and allow the write to
+ // proceed
+ Status s;
+ if (write_controller_.IsStopped()) {
+ // If writes are still stopped, it means we bailed due to a background
+ // error
+ s = Status::Incomplete(error_handler_.GetBGError().ToString());
+ }
+ if (error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch) {
+ assert(write_options.low_pri);
+ // This is called outside the DB mutex. Although it is safe to make the call,
+ // the consistency condition is not guaranteed to hold. It's OK to live with
+ // it in this case.
+  // If we need to speed up compaction, it means compaction has fallen behind,
+  // so we start rate-limiting low-priority writes.
+ if (write_controller_.NeedSpeedupCompaction()) {
+ if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
+ // For 2PC, we only rate limit prepare, not commit.
+ return Status::OK();
+ }
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Low priority write stall");
+ } else {
+ assert(my_batch != nullptr);
+      // Rate-limit these writes. We don't block them completely because, if
+      // the write load is heavy, low-priority writes might never get a chance
+      // to run; this way they are guaranteed to still make slow progress.
+ PERF_TIMER_GUARD(write_delay_time);
+ write_controller_.low_pri_rate_limiter()->Request(
+ my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ }
+ }
+ return Status::OK();
+}
+
+void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
+ assert(cfds != nullptr);
+ if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
+ for (ColumnFamilyData* cfd : *cfds) {
+ if (cfd == cfd_stats) {
+ // stats CF already included in cfds
+ return;
+ }
+ }
+      // Force flush the stats CF when its log number is less than all other
+      // CFs' log numbers.
+ bool force_flush_stats_cf = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ force_flush_stats_cf = false;
+ }
+ }
+ if (force_flush_stats_cf) {
+ cfds->push_back(cfd_stats);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with automated flush "
+ "to avoid holding old logs");
+ }
+ }
+ }
+}
+
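+// Frees flushed memtables that are retained in history for the column
+// families queued by trim_history_scheduler_, and installs a new SuperVersion
+// whenever any memtable is actually deleted.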
+Status DBImpl::TrimMemtableHistory(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
+ nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ for (auto& cfd : cfds) {
+ autovector<MemTable*> to_delete;
+ cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage());
+ if (!to_delete.empty()) {
+ for (auto m : to_delete) {
+ delete m;
+ }
+ context->superversion_context.NewSuperVersion();
+ assert(context->superversion_context.new_superversion.get() != nullptr);
+ cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
+ }
+
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ }
+ return Status::OK();
+}
+
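+// Switches memtables for the column families picked up by the flush scheduler
+// (or for all eligible column families when atomic_flush is set) and schedules
+// the corresponding flush requests.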
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ flush_scheduler_.Clear();
+ } else {
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ Status status;
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (auto& cfd : cfds) {
+ if (!cfd->mem()->IsEmpty()) {
+ status = SwitchMemtable(cfd, context);
+ }
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
+ const MemTableInfo& mem_table_info) {
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnMemTableSealed(mem_table_info);
+ }
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+// REQUIRES: this thread is currently at the front of the 2nd writer queue if
+// two_write_queues_ is true (This is to simplify the reasoning.)
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
+ mutex_.AssertHeld();
+ WriteThread::Writer nonmem_w;
+ std::unique_ptr<WritableFile> lfile;
+ log::Writer* new_log = nullptr;
+ MemTable* new_mem = nullptr;
+
+ // Recoverable state is persisted in WAL. After memtable switch, WAL might
+ // be deleted, so we write the state to memtable to be persisted as well.
+ Status s = WriteRecoverableState();
+ if (!s.ok()) {
+ return s;
+ }
+
+  // Attempt to switch to a new memtable and trigger a flush of the old one.
+  // Do this without holding the DB mutex.
+ assert(versions_->prev_log_number() == 0);
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
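+  // Only create a new WAL if the current one already contains data; an empty
+  // WAL can simply be reused.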
+ bool creating_new_log = !log_empty_;
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ uint64_t recycle_log_number = 0;
+ if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
+ !log_recycle_files_.empty()) {
+ recycle_log_number = log_recycle_files_.front();
+ }
+ uint64_t new_log_number =
+ creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+
+ // Set memtable_info for memtable sealed callback
+#ifndef ROCKSDB_LITE
+ MemTableInfo memtable_info;
+ memtable_info.cf_name = cfd->GetName();
+ memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
+ memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
+ memtable_info.num_entries = cfd->mem()->num_entries();
+ memtable_info.num_deletes = cfd->mem()->num_deletes();
+#endif // ROCKSDB_LITE
+ // Log this later after lock release. It may be outdated, e.g., if background
+ // flush happens before logging, but that should be ok.
+ int num_imm_unflushed = cfd->imm()->NumNotFlushed();
+ const auto preallocate_block_size =
+ GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
+ mutex_.Unlock();
+ if (creating_new_log) {
+ // TODO: Write buffer size passed in should be max of all CF's instead
+ // of mutable_cf_options.write_buffer_size.
+ s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size,
+ &new_log);
+ }
+ if (s.ok()) {
+ SequenceNumber seq = versions_->LastSequence();
+ new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
+ context->superversion_context.NewSuperVersion();
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] New memtable created with log file: #%" PRIu64
+ ". Immutable memtables: %d.\n",
+ cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
+ mutex_.Lock();
+ if (recycle_log_number != 0) {
+ // Since renaming the file is done outside DB mutex, we need to ensure
+ // concurrent full purges don't delete the file while we're recycling it.
+ // To achieve that we hold the old log number in the recyclable list until
+ // after it has been renamed.
+ assert(log_recycle_files_.front() == recycle_log_number);
+ log_recycle_files_.pop_front();
+ }
+ if (s.ok() && creating_new_log) {
+ log_write_mutex_.Lock();
+ assert(new_log != nullptr);
+ if (!logs_.empty()) {
+      // Always flush the buffer of the last log before switching to a new one
+ log::Writer* cur_log_writer = logs_.back().writer;
+ s = cur_log_writer->WriteBuffer();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+ " WAL file\n",
+ cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+ new_log_number);
+ }
+ }
+ if (s.ok()) {
+ logfile_number_ = new_log_number;
+ log_empty_ = true;
+ log_dir_synced_ = false;
+ logs_.emplace_back(logfile_number_, new_log);
+ alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+ }
+ log_write_mutex_.Unlock();
+ }
+
+ if (!s.ok()) {
+    // How could we fail if we're not creating a new log?
+ assert(creating_new_log);
+ if (new_mem) {
+ delete new_mem;
+ }
+ if (new_log) {
+ delete new_log;
+ }
+ SuperVersion* new_superversion =
+ context->superversion_context.new_superversion.release();
+ if (new_superversion != nullptr) {
+ delete new_superversion;
+ }
+ // We may have lost data from the WritableFileBuffer in-memory buffer for
+ // the current log, so treat it as a fatal error and set bg_error
+ error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+ // Read back bg_error in order to get the right severity
+ s = error_handler_.GetBGError();
+ return s;
+ }
+
+ for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
+    // All this is just an optimization to delete logs that are no longer
+    // needed: if a CF is empty, it doesn't need that particular log to stay
+    // alive, so we just advance its log number. No need to persist this in
+    // the manifest.
+ if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
+ loop_cfd->imm()->NumNotFlushed() == 0) {
+ if (creating_new_log) {
+ loop_cfd->SetLogNumber(logfile_number_);
+ }
+ loop_cfd->mem()->SetCreationSeq(versions_->LastSequence());
+ }
+ }
+
+ cfd->mem()->SetNextLogNumber(logfile_number_);
+ cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+ mutable_cf_options);
+#ifndef ROCKSDB_LITE
+ mutex_.Unlock();
+ // Notify client that memtable is sealed, now that we have successfully
+ // installed a new memtable
+ NotifyOnMemTableSealed(cfd, memtable_info);
+ mutex_.Lock();
+#endif // ROCKSDB_LITE
+ return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+ mutex_.AssertHeld();
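+  // Start with roughly 1.1x the write buffer size, then cap it with the WAL
+  // and database-wide write buffer limits checked below.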
+ size_t bsize =
+ static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+ // Some users might set very high write_buffer_size and rely on
+ // max_total_wal_size or other parameters to control the WAL size.
+ if (mutable_db_options_.max_total_wal_size > 0) {
+ bsize = std::min<size_t>(
+ bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+ }
+ if (immutable_db_options_.db_write_buffer_size > 0) {
+ bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
+ }
+ if (immutable_db_options_.write_buffer_manager &&
+ immutable_db_options_.write_buffer_manager->enabled()) {
+ bsize = std::min<size_t>(
+ bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+ }
+
+ return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ if (nullptr == opt.timestamp) {
+    // Pre-allocate the size of the write batch conservatively.
+    // 8 bytes are taken by the header, 4 bytes by the count, 1 byte by the
+    // type, and we allocate 11 extra bytes for the key length as well as the
+    // value length.
+ WriteBatch batch(key.size() + value.size() + 24);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+ }
+ const Slice* ts = opt.timestamp;
+ assert(nullptr != ts);
+ size_t ts_sz = ts->size();
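+  // Size the batch to also hold the user timestamp alongside the key and
+  // value.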
+ WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0,
+ ts_sz);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ s = batch.AssignTimestamp(*ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ WriteBatch batch;
+ batch.Delete(column_family, key);
+ return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ WriteBatch batch;
+ batch.SingleDelete(column_family, key);
+ return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ WriteBatch batch;
+ batch.DeleteRange(column_family, begin_key, end_key);
+ return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ WriteBatch batch;
+ Status s = batch.Merge(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_secondary_test.cc b/src/rocksdb/db/db_impl/db_secondary_test.cc
new file mode 100644
index 000000000..0b34181de
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_secondary_test.cc
@@ -0,0 +1,869 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
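+// Test fixture that, in addition to the primary DB opened by DBTestBase,
+// manages a read-only secondary instance opened in its own directory.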
+class DBSecondaryTest : public DBTestBase {
+ public:
+ DBSecondaryTest()
+ : DBTestBase("/db_secondary_test"),
+ secondary_path_(),
+ handles_secondary_(),
+ db_secondary_(nullptr) {
+ secondary_path_ =
+ test::PerThreadDBPath(env_, "/db_secondary_test_secondary");
+ }
+
+ ~DBSecondaryTest() override {
+ CloseSecondary();
+ if (getenv("KEEP_DB") != nullptr) {
+ fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str());
+ } else {
+ Options options;
+ options.env = env_;
+ EXPECT_OK(DestroyDB(secondary_path_, options));
+ }
+ }
+
+ protected:
+ Status ReopenAsSecondary(const Options& options) {
+ return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_);
+ }
+
+ void OpenSecondary(const Options& options);
+
+ void OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options);
+
+ void CloseSecondary() {
+ for (auto h : handles_secondary_) {
+ db_secondary_->DestroyColumnFamilyHandle(h);
+ }
+ handles_secondary_.clear();
+ delete db_secondary_;
+ db_secondary_ = nullptr;
+ }
+
+ DBImplSecondary* db_secondary_full() {
+ return static_cast<DBImplSecondary*>(db_secondary_);
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int expected_log,
+ int expected_sst, int expected_manifest) const;
+
+ std::string secondary_path_;
+ std::vector<ColumnFamilyHandle*> handles_secondary_;
+ DB* db_secondary_;
+};
+
+void DBSecondaryTest::OpenSecondary(const Options& options) {
+ Status s =
+ DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
+void DBSecondaryTest::OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options) {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ for (const auto& cf_name : column_families) {
+ cf_descs.emplace_back(cf_name, options);
+ }
+ Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
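+// Counts WAL, SST and MANIFEST files under `dir` and checks the counts
+// against the expected values.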
+void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir,
+ int expected_log, int expected_sst,
+ int expected_manifest) const {
+ std::vector<std::string> filenames;
+ env_->GetChildren(dir, &filenames);
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kLogFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(expected_log, log_cnt);
+ ASSERT_EQ(expected_sst, sst_cnt);
+ ASSERT_EQ(expected_manifest, manifest_cnt);
+}
+
+TEST_F(DBSecondaryTest, ReopenAsSecondary) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Put("bar", "bar_value"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ Close();
+
+ ASSERT_OK(ReopenAsSecondary(options));
+ ASSERT_EQ("foo_value", Get("foo"));
+ ASSERT_EQ("bar_value", Get("bar"));
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ auto db1 = static_cast<DBImplSecondary*>(db_);
+ ASSERT_NE(nullptr, db1);
+ Iterator* iter = db1->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ if (0 == count) {
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value", iter->value().ToString());
+ } else if (1 == count) {
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value", iter->value().ToString());
+ }
+ ++count;
+ }
+ delete iter;
+ ASSERT_EQ(2, count);
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+}
+
+namespace {
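+// Env wrapper that counts how many of the random-access files it opened have
+// been closed; used below to verify that the secondary closes table files
+// that become obsolete after the primary compacts.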
+class TraceFileEnv : public EnvWrapper {
+ public:
+ explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {}
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& env_options) override {
+ class TracedRandomAccessFile : public RandomAccessFile {
+ public:
+ TracedRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+ std::atomic<int>& counter)
+ : target_(std::move(target)), files_closed_(counter) {}
+ ~TracedRandomAccessFile() override {
+ files_closed_.fetch_add(1, std::memory_order_relaxed);
+ }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ std::atomic<int>& files_closed_;
+ };
+ Status s = target()->NewRandomAccessFile(f, r, env_options);
+ if (s.ok()) {
+ r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_));
+ }
+ return s;
+ }
+
+ int files_closed() const {
+ return files_closed_.load(std::memory_order_relaxed);
+ }
+
+ private:
+ std::atomic<int> files_closed_{0};
+};
+} // namespace
+
+TEST_F(DBSecondaryTest, SecondaryCloseFiles) {
+ Options options;
+ options.env = env_;
+ options.max_open_files = 1;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ Options options1;
+ std::unique_ptr<Env> traced_env(new TraceFileEnv(env_));
+ options1.env = traced_env.get();
+ OpenSecondary(options1);
+
+ static const auto verify_db = [&]() {
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(ReadOptions()));
+ for (iter1->SeekToFirst(), iter2->SeekToFirst();
+ iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) {
+ ASSERT_EQ(iter1->key(), iter2->key());
+ ASSERT_EQ(iter1->value(), iter2->value());
+ }
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_FALSE(iter2->Valid());
+ };
+
+ ASSERT_OK(Put("a", "value"));
+ ASSERT_OK(Put("c", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Put("d", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_EQ(2, static_cast<TraceFileEnv*>(traced_env.get())->files_closed());
+
+ Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}});
+ ASSERT_TRUE(s.IsNotSupported());
+ CloseSecondary();
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "new_foo_value_1"));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value_1", "new_bar_value");
+}
+
+TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options1);
+ cf_descs.emplace_back("pikachu", options1);
+ cf_descs.emplace_back("eevee", options1);
+ Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_NOK(s);
+}
+
+TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_EQ(0, handles_secondary_.size());
+ ASSERT_NE(nullptr, db_secondary_);
+
+ ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Flush(0 /*cf*/));
+ ASSERT_OK(Flush(1 /*cf*/));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ Close();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0",
+ "VersionSet::ProcessManifestWrites:BeforeNewManifest"},
+ {"VersionSet::ProcessManifestWrites:AfterNewManifest",
+ "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:"
+ "1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make sure db calls RecoverLogFiles so as to trigger a manifest write,
+ // which causes the db to switch to a new MANIFEST upon start.
+ port::Thread ro_db_thread([&]() {
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ CloseSecondary();
+ });
+ Reopen(options);
+ ro_db_thread.join();
+}
+
+TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, MissingTableFile) {
+ int table_files_not_exist = 0;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers",
+ [&](void* arg) {
+ Status s = *reinterpret_cast<Status*>(arg);
+ if (s.IsPathNotFound()) {
+ ++table_files_not_exist;
+ } else if (!s.ok()) {
+ assert(false); // Should not reach here
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_NE(nullptr, db_secondary_full());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist);
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) {
+ Options options;
+ options.env = env_;
+ const std::string kCfName1 = "pikachu";
+ CreateAndReopenWithCF({kCfName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCfName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1"));
+ ASSERT_OK(Flush(1 /*cf*/));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ Close();
+ CheckFileTypeCounts(dbname_, 1, 0, 1);
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ value.clear();
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchManifest) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const int kNumFiles = options.level0_file_num_compaction_trigger - 1;
+ // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1,
+ // ..., 9.
+ const int kNumKeys = 10;
+  // Create kNumFiles SST files
+ for (int i = 0; i != kNumFiles; ++i) {
+ for (int j = 0; j != kNumKeys; ++j) {
+ ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ const auto& range_scan_db = [&]() {
+ ReadOptions tmp_ropts;
+ tmp_ropts.total_order_seek = true;
+ tmp_ropts.verify_checksums = true;
+ std::unique_ptr<Iterator> iter(db_secondary_->NewIterator(tmp_ropts));
+ int cnt = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) {
+ ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString());
+ ASSERT_EQ("value_" + std::to_string(kNumFiles - 1),
+ iter->value().ToString());
+ }
+ };
+
+ range_scan_db();
+
+  // While the secondary instance still keeps the old MANIFEST open, we close
+  // the primary, restart it, perform a full compaction, close it again, and
+  // restart it once more, so that the next time the secondary tries to catch
+  // up with the primary, it will skip the MANIFEST in the middle.
+ Reopen(options);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ range_scan_db();
+}
+
+// Here, "Snapshot" refers to the version edits written by
+// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after
+// switching from the old one.
+TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ ASSERT_OK(Put("0", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::string value;
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+ ASSERT_EQ("value0", value);
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+}
+
+TEST_F(DBSecondaryTest, SwitchWAL) {
+ const int kNumKeysPerMemtable = 1;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const auto& verify_db = [](DB* db1, DB* db2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ };
+ for (int k = 0; k != 16; ++k) {
+ ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k)));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), db_secondary_);
+ }
+}
+
+TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) {
+ const int kNumKeysPerMemtable = 1;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:ContextCleanedUp",
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kCFName1 = "pikachu";
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ CreateAndReopenWithCF({kCFName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCFName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ const auto& verify_db = [](DB* db1,
+ const std::vector<ColumnFamilyHandle*>& handles1,
+ DB* db2,
+ const std::vector<ColumnFamilyHandle*>& handles2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ ASSERT_EQ(handles1.size(), handles2.size());
+ for (size_t i = 0; i != handles1.size(); ++i) {
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts, handles1[i]));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts, handles2[i]));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ }
+ };
+ for (int k = 0; k != 8; ++k) {
+ ASSERT_OK(
+ Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k)));
+ ASSERT_OK(
+ Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k)));
+ TEST_SYNC_POINT(
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp");
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), handles_, db_secondary_, handles_secondary_);
+ SyncPoint::GetInstance()->ClearTrace();
+ }
+}
+
+TEST_F(DBSecondaryTest, CatchUpAfterFlush) {
+ const int kNumKeysPerMemtable = 16;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ WriteOptions write_opts;
+ WriteBatch wb;
+ wb.Put("key0", "value0");
+ wb.Put("key1", "value1");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb));
+ ReadOptions read_opts;
+ std::unique_ptr<Iterator> iter1(db_secondary_->NewIterator(read_opts));
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(read_opts));
+ iter2->Seek("key0");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ("value0", iter2->value());
+ iter2->Seek("key1");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ("value1", iter2->value());
+
+ {
+ WriteBatch wb1;
+ wb1.Put("key0", "value01");
+ wb1.Put("key1", "value11");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb1));
+ }
+
+ {
+ WriteBatch wb2;
+ wb2.Put("key0", "new_value0");
+ wb2.Delete("key1");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb2));
+ }
+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::unique_ptr<Iterator> iter3(db_secondary_->NewIterator(read_opts));
+ // iter3 should not see value01 and value11 at all.
+ iter3->Seek("key0");
+ ASSERT_TRUE(iter3->Valid());
+ ASSERT_EQ("new_value0", iter3->value());
+ iter3->Seek("key1");
+ ASSERT_FALSE(iter3->Valid());
+}
+
+TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) {
+ bool called = false;
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ called = true;
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "DBImpl::CheckConsistency:BeforeGetFileSize"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("a", "value0"));
+ ASSERT_OK(Put("c", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "value1"));
+ ASSERT_OK(Put("d", "value1"));
+ ASSERT_OK(Flush());
+ port::Thread thread([this]() {
+ Options opts;
+ opts.env = env_;
+ opts.max_open_files = -1;
+ OpenSecondary(opts);
+ });
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+ ASSERT_TRUE(called);
+}
+#endif //! ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_info_dumper.cc b/src/rocksdb/db/db_info_dumper.cc
new file mode 100644
index 000000000..7008ca6ff
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_info_dumper.h"
+
+#include <stdio.h>
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+ const std::string& dbname) {
+ if (options.info_log == nullptr) {
+ return;
+ }
+
+ auto* env = options.env;
+ uint64_t number = 0;
+ FileType type = kInfoLogFile;
+
+ std::vector<std::string> files;
+ uint64_t file_num = 0;
+ uint64_t file_size;
+ std::string file_info, wal_info;
+
+ Header(options.info_log, "DB SUMMARY\n");
+ // Get files in dbname dir
+ if (!env->GetChildren(dbname, &files).ok()) {
+ Error(options.info_log,
+ "Error when reading %s dir\n", dbname.c_str());
+ }
+ std::sort(files.begin(), files.end());
+ for (const std::string& file : files) {
+ if (!ParseFileName(file, &number, &type)) {
+ continue;
+ }
+ switch (type) {
+ case kCurrentFile:
+ Header(options.info_log, "CURRENT file: %s\n", file.c_str());
+ break;
+ case kIdentityFile:
+ Header(options.info_log, "IDENTITY file: %s\n", file.c_str());
+ break;
+ case kDescriptorFile:
+ env->GetFileSize(dbname + "/" + file, &file_size);
+ Header(options.info_log, "MANIFEST file: %s size: %" PRIu64 " Bytes\n",
+ file.c_str(), file_size);
+ break;
+ case kLogFile:
+ env->GetFileSize(dbname + "/" + file, &file_size);
+ char str[16];
+ snprintf(str, sizeof(str), "%" PRIu64, file_size);
+ wal_info.append(file).append(" size: ").
+ append(str).append(" ; ");
+ break;
+ case kTableFile:
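+        // Only record the names of the first few SST files so the summary
+        // stays short; the total count is still reported below.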
+ if (++file_num < 10) {
+ file_info.append(file).append(" ");
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+  // Get SST files in each db_path dir
+ for (auto& db_path : options.db_paths) {
+ if (dbname.compare(db_path.path) != 0) {
+ if (!env->GetChildren(db_path.path, &files).ok()) {
+ Error(options.info_log,
+ "Error when reading %s dir\n",
+ db_path.path.c_str());
+ continue;
+ }
+ std::sort(files.begin(), files.end());
+ for (const std::string& file : files) {
+ if (ParseFileName(file, &number, &type)) {
+ if (type == kTableFile && ++file_num < 10) {
+ file_info.append(file).append(" ");
+ }
+ }
+ }
+ }
+ Header(options.info_log,
+ "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+ db_path.path.c_str(), file_num, file_info.c_str());
+ file_num = 0;
+ file_info.clear();
+ }
+
+  // Get WAL files in wal_dir
+ if (dbname.compare(options.wal_dir) != 0) {
+ if (!env->GetChildren(options.wal_dir, &files).ok()) {
+ Error(options.info_log,
+ "Error when reading %s dir\n",
+ options.wal_dir.c_str());
+ return;
+ }
+ wal_info.clear();
+ for (const std::string& file : files) {
+ if (ParseFileName(file, &number, &type)) {
+ if (type == kLogFile) {
+ env->GetFileSize(options.wal_dir + "/" + file, &file_size);
+ char str[16];
+ snprintf(str, sizeof(str), "%" PRIu64, file_size);
+ wal_info.append(file).append(" size: ").
+ append(str).append(" ; ");
+ }
+ }
+ }
+ }
+ Header(options.info_log, "Write Ahead Log file in %s: %s\n",
+ options.wal_dir.c_str(), wal_info.c_str());
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.h b/src/rocksdb/db/db_info_dumper.h
new file mode 100644
index 000000000..91404cbd7
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "options/db_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+ const std::string& dbname);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_inplace_update_test.cc b/src/rocksdb/db/db_inplace_update_test.cc
new file mode 100644
index 000000000..26405864e
--- /dev/null
+++ b/src/rocksdb/db/db_inplace_update_test.cc
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestInPlaceUpdate : public DBTestBase {
+ public:
+ DBTestInPlaceUpdate() : DBTestBase("/db_inplace_update_test") {}
+};
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ int numValues = 10;
+ for (int i = numValues; i > 0; i--) {
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ int numValues = 10;
+ for (int i = 0; i < numValues; i++) {
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ }
+
+ // All 10 updates exist in the internal iterator
+ validateNumberOfEntries(numValues, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ int numValues = 10;
+ ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+ ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+ for (int i = numValues; i > 0; i--) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller varint size
+ int numValues = 265;
+ ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+ ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+ for (int i = numValues; i > 0; i--) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceLargerSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ int numValues = 10;
+ for (int i = 0; i < numValues; i++) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+ }
+
+    // No in-place updates. All updates are puts with a new sequence number.
+    // All 10 updates exist in the internal iterator.
+ validateNumberOfEntries(numValues, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceNoAction;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Callback function requests no actions from db
+ ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
+ ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
+ } while (ChangeCompactOptions());
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_io_failure_test.cc b/src/rocksdb/db/db_io_failure_test.cc
new file mode 100644
index 000000000..f8d562447
--- /dev/null
+++ b/src/rocksdb/db/db_io_failure_test.cc
@@ -0,0 +1,568 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIOFailureTest : public DBTestBase {
+ public:
+ DBIOFailureTest() : DBTestBase("/db_io_failure_test") {}
+};
+
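+// These tests drive I/O failures through the fault-injection hooks exposed by
+// SpecialEnv in db_test_util.h (for example drop_writes_, no_space_,
+// non_writeable_rate_, manifest_sync_error_ and log_write_error_), so no real
+// filesystem faults are needed.
+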
+#ifndef ROCKSDB_LITE
+// Check that number of files does not grow when writes are dropped
+TEST_F(DBIOFailureTest, DropWrites) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.paranoid_checks = false;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ Compact("a", "z");
+ const size_t num_files = CountFiles();
+ // Force out-of-space errors
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->sleep_counter_.Reset();
+ env_->no_slowdown_ = true;
+ for (int i = 0; i < 5; i++) {
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ for (int level = 0; level < dbfull()->NumberLevels(); level++) {
+ if (level > 0 && level == dbfull()->NumberLevels() - 1) {
+ break;
+ }
+ dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ }
+ } else {
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+ }
+
+ std::string property_value;
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("5", property_value);
+
+ env_->drop_writes_.store(false, std::memory_order_release);
+ ASSERT_LT(CountFiles(), num_files + 3);
+
+ // Check that compaction attempts slept after errors
+ // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler
+ // versions
+ ASSERT_GE(env_->sleep_counter_.Read(), 4);
+ } while (ChangeCompactOptions());
+}
+
+// Check background error counter bumped on flush failures.
+TEST_F(DBIOFailureTest, DropWritesFlush) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.max_background_flushes = 1;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ // Force out-of-space errors
+ env_->drop_writes_.store(true, std::memory_order_release);
+
+ std::string property_value;
+ // Background error count is 0 now.
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("0", property_value);
+
+ dbfull()->TEST_FlushMemTable(true);
+
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("1", property_value);
+
+ env_->drop_writes_.store(false, std::memory_order_release);
+ } while (ChangeCompactOptions());
+}
+
+// Check that CompactRange() returns failure if there is not enough space left
+// on device
+TEST_F(DBIOFailureTest, NoSpaceCompactRange) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // generate 5 tables
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(Put(Key(i), Key(i) + "v"));
+ ASSERT_OK(Flush());
+ }
+
+ // Force out-of-space errors
+ env_->no_space_.store(true, std::memory_order_release);
+
+ Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_TRUE(s.IsNoSpace());
+
+ env_->no_space_.store(false, std::memory_order_release);
+ } while (ChangeCompactOptions());
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBIOFailureTest, NonWritableFileSystem) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 4096;
+ options.arena_block_size = 4096;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ env_->non_writeable_rate_.store(100);
+ std::string big(100000, 'x');
+ int errors = 0;
+ for (int i = 0; i < 20; i++) {
+ if (!Put("foo", big).ok()) {
+ errors++;
+ env_->SleepForMicroseconds(100000);
+ }
+ }
+ ASSERT_GT(errors, 0);
+ env_->non_writeable_rate_.store(0);
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBIOFailureTest, ManifestWriteError) {
+ // Test for the following problem:
+ // (a) Compaction produces file F
+ // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+ // (c) GC deletes F
+ // (d) After reopening DB, reads fail since deleted F is named in log record
+
+ // We iterate twice. In the second iteration, everything is the
+ // same except the log record never makes it to the MANIFEST file.
+ for (int iter = 0; iter < 2; iter++) {
+ std::atomic<bool>* error_type = (iter == 0) ? &env_->manifest_sync_error_
+ : &env_->manifest_write_error_;
+
+ // Insert foo=>bar mapping
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Memtable compaction (will succeed)
+ Flush();
+ ASSERT_EQ("bar", Get("foo"));
+ const int last = 2;
+ MoveFilesToLevel(2);
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail
+ ASSERT_EQ("bar", Get("foo"));
+
+ error_type->store(false, std::memory_order_release);
+
+ // Since paranoid_checks=true, writes should fail
+ ASSERT_NOK(Put("foo2", "bar2"));
+
+ // Recovery: should not lose data
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Try again with paranoid_checks=false
+ Close();
+ options.paranoid_checks = false;
+ Reopen(options);
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Recovery: should not lose data
+ error_type->store(false, std::memory_order_release);
+ Reopen(options);
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Since paranoid_checks=false, writes should succeed
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ("bar2", Get("foo2"));
+ }
+}
+
+TEST_F(DBIOFailureTest, PutFailsParanoid) {
+ // Test the following:
+ // (a) A random put fails in paranoid mode (simulated by a sync failure)
+ // (b) All other puts have to fail, even if writes would succeed
+ // (c) All of that should happen ONLY if paranoid_checks = true
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ // simulate error
+ env_->log_write_error_.store(true, std::memory_order_release);
+ s = Put(1, "foo2", "bar2");
+ ASSERT_TRUE(!s.ok());
+ env_->log_write_error_.store(false, std::memory_order_release);
+ s = Put(1, "foo3", "bar3");
+ // the next put should fail, too
+ ASSERT_TRUE(!s.ok());
+ // but we're still able to read
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ // do the same thing with paranoid checks off
+ options.paranoid_checks = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ // simulate error
+ env_->log_write_error_.store(true, std::memory_order_release);
+ s = Put(1, "foo2", "bar2");
+ ASSERT_TRUE(!s.ok());
+ env_->log_write_error_.store(false, std::memory_order_release);
+ s = Put(1, "foo3", "bar3");
+ // the next put should NOT fail
+ ASSERT_TRUE(s.ok());
+}
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBIOFailureTest, FlushSstRangeSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 256 * 1024 * 1024;
+ options.writable_file_max_buffer_size = 128 * 1024;
+ options.bytes_per_sync = 128 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(10));
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ std::atomic<int> range_sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+ if (range_sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("range sync dummy error");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::string rnd_str =
+ RandomString(&rnd, static_cast<int>(options.bytes_per_sync / 2));
+ std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // First 1MB doesn't get range synced
+ ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo1_1", rnd_str));
+ ASSERT_OK(Put(1, "foo1_2", rnd_str));
+ ASSERT_OK(Put(1, "foo1_3", rnd_str));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Put(1, "foo3_1", rnd_str));
+ ASSERT_OK(Put(1, "foo3_2", rnd_str));
+ ASSERT_OK(Put(1, "foo3_3", rnd_str));
+ ASSERT_OK(Put(1, "foo4", "bar"));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_GE(1, range_sync_called.load());
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, CompactSstRangeSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 256 * 1024 * 1024;
+ options.writable_file_max_buffer_size = 128 * 1024;
+ options.bytes_per_sync = 128 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 256 * 1024 * 1024;
+ options.disable_auto_compactions = true;
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ Random rnd(301);
+ std::string rnd_str =
+ RandomString(&rnd, static_cast<int>(options.bytes_per_sync / 2));
+ std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // First 1MB doesn't get range synced
+ ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo1_1", rnd_str));
+ ASSERT_OK(Put(1, "foo1_2", rnd_str));
+ ASSERT_OK(Put(1, "foo1_3", rnd_str));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo3_1", rnd_str));
+ ASSERT_OK(Put(1, "foo3_2", rnd_str));
+ ASSERT_OK(Put(1, "foo3_3", rnd_str));
+ ASSERT_OK(Put(1, "foo4", "bar"));
+ Flush(1);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ std::atomic<int> range_sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+ if (range_sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("range sync dummy error");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ dbfull()->TEST_WaitForCompact();
+
+ // Following writes should fail as the compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_GE(1, range_sync_called.load());
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, FlushSstCloseError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(2));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+ std::atomic<int> close_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Close", [&](void* arg) {
+ if (close_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("close dummy error");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
+TEST_F(DBIOFailureTest, CompactionSstCloseError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar3"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ dbfull()->TEST_WaitForCompact();
+
+ std::atomic<int> close_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Close", [&](void* arg) {
+ if (close_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("close dummy error");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ dbfull()->TEST_WaitForCompact();
+
+ // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar3", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar3", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, FlushSstSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.use_fsync = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(2));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+ std::atomic<int> sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+ if (sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("sync dummy error");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
+TEST_F(DBIOFailureTest, CompactionSstSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = true;
+ options.use_fsync = false;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Status s;
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ ASSERT_OK(Put(1, "foo", "bar3"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ Flush(1);
+ dbfull()->TEST_WaitForCompact();
+
+ std::atomic<int> sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+ if (sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError("sync dummy error");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ dbfull()->TEST_WaitForCompact();
+
+ // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar3", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar3", Get(1, "foo"));
+}
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc
new file mode 100644
index 000000000..e5d402948
--- /dev/null
+++ b/src/rocksdb/db/db_iter.cc
@@ -0,0 +1,1310 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+#include <string>
+#include <iostream>
+#include <limits>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DBIter::DBIter(Env* _env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* cmp, InternalIterator* iter, SequenceNumber s,
+ bool arena_mode, uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool allow_blob)
+ : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+ env_(_env),
+ logger_(cf_options.info_log),
+ user_comparator_(cmp),
+ merge_operator_(cf_options.merge_operator),
+ iter_(iter),
+ read_callback_(read_callback),
+ sequence_(s),
+ statistics_(cf_options.statistics),
+ num_internal_keys_skipped_(0),
+ iterate_lower_bound_(read_options.iterate_lower_bound),
+ iterate_upper_bound_(read_options.iterate_upper_bound),
+ direction_(kForward),
+ valid_(false),
+ current_entry_is_merged_(false),
+ is_key_seqnum_zero_(false),
+ prefix_same_as_start_(mutable_cf_options.prefix_extractor
+ ? read_options.prefix_same_as_start
+ : false),
+ pin_thru_lifetime_(read_options.pin_data),
+ expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+ read_options.total_order_seek ||
+ read_options.auto_prefix_mode),
+ allow_blob_(allow_blob),
+ is_blob_(false),
+ arena_mode_(arena_mode),
+ range_del_agg_(&cf_options.internal_comparator, s),
+ db_impl_(db_impl),
+ cfd_(cfd),
+ start_seqnum_(read_options.iter_start_seqnum) {
+ RecordTick(statistics_, NO_ITERATOR_CREATED);
+ max_skip_ = max_sequential_skip_in_iterations;
+ max_skippable_internal_keys_ = read_options.max_skippable_internal_keys;
+ if (pin_thru_lifetime_) {
+ pinned_iters_mgr_.StartPinning();
+ }
+ if (iter_.iter()) {
+ iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+ }
+}
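+
+// Note: a DBIter is normally not constructed directly. It is created either
+// through NewDBIterator() at the bottom of this file (arena_mode == false) or
+// through the arena-backed wrapper in arena_wrapped_db_iter.cc
+// (arena_mode == true).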
+
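+// Illustrative use of the property interface from client code (a sketch only;
+// "it" is assumed to be an Iterator obtained from DB::NewIterator()):
+//
+//   std::string is_pinned;
+//   if (it->GetProperty("rocksdb.iterator.is-key-pinned", &is_pinned).ok()) {
+//     // "1" means pin_data was requested and the current key is pinned.
+//   }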
+Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
+ if (prop == nullptr) {
+ return Status::InvalidArgument("prop is nullptr");
+ }
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ // First try to pass the value returned from inner iterator.
+ return iter_.iter()->GetProperty(prop_name, prop);
+ } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
+ if (valid_) {
+ *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
+ } else {
+ *prop = "Iterator is not valid.";
+ }
+ return Status::OK();
+ } else if (prop_name == "rocksdb.iterator.internal-key") {
+ *prop = saved_key_.GetUserKey().ToString();
+ return Status::OK();
+ }
+ return Status::InvalidArgument("Unidentified property.");
+}
+
+bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+ if (!ParseInternalKey(iter_.key(), ikey)) {
+ status_ = Status::Corruption("corrupted internal key in DBIter");
+ valid_ = false;
+ ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s",
+ iter_.key().ToString(true).c_str());
+ return false;
+ } else {
+ return true;
+ }
+}
+
+void DBIter::Next() {
+ assert(valid_);
+ assert(status_.ok());
+
+ PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_);
+ // Release temporarily pinned blocks from last operation
+ ReleaseTempPinnedData();
+ local_stats_.skip_count_ += num_internal_keys_skipped_;
+ local_stats_.skip_count_--;
+ num_internal_keys_skipped_ = 0;
+ bool ok = true;
+ if (direction_ == kReverse) {
+ is_key_seqnum_zero_ = false;
+ if (!ReverseToForward()) {
+ ok = false;
+ }
+ } else if (!current_entry_is_merged_) {
+ // If the current value is not a merge, the iter position is the
+ // current key, which has already been returned. We can safely issue a
+ // Next() without checking the current key.
+ // If the current key is a merge, iter very likely already points
+ // to the next internal position.
+ assert(iter_.Valid());
+ iter_.Next();
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ }
+
+ local_stats_.next_count_++;
+ if (ok && iter_.Valid()) {
+ Slice prefix;
+ if (prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix = prefix_.GetUserKey();
+ }
+ FindNextUserEntry(true /* skipping the current user key */,
+ prefix_same_as_start_ ? &prefix : nullptr);
+ } else {
+ is_key_seqnum_zero_ = false;
+ valid_ = false;
+ }
+ if (statistics_ != nullptr && valid_) {
+ local_stats_.next_found_count_++;
+ local_stats_.bytes_read_ += (key().size() + value().size());
+ }
+}
+
+// PRE: saved_key_ has the current user key if skipping_saved_key
+// POST: saved_key_ should have the next user key if valid_,
+// if the current entry is a result of merge
+// current_entry_is_merged_ => true
+// saved_value_ => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+// a delete marker or a sequence number higher than sequence_
+// saved_key_ MUST have a proper user_key before calling this function
+//
+// The prefix parameter, if not null, indicates that we need to iterate
+// within the prefix, and the iterator needs to be made invalid if no
+// more entries for the prefix can be found.
+bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) {
+ PERF_TIMER_GUARD(find_next_user_entry_time);
+ return FindNextUserEntryInternal(skipping_saved_key, prefix);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
+ const Slice* prefix) {
+ // Loop until we hit an acceptable entry to yield
+ assert(iter_.Valid());
+ assert(status_.ok());
+ assert(direction_ == kForward);
+ current_entry_is_merged_ = false;
+
+ // How many times in a row we have skipped an entry with user key less than
+ // or equal to saved_key_. We could skip these entries either because
+ // sequence numbers were too high or because skipping_saved_key = true.
+ // What saved_key_ contains throughout this method:
+ // - if skipping_saved_key : saved_key_ contains the key that we need
+ // to skip, and we haven't seen any keys greater
+ // than that,
+ // - if num_skipped > 0 : saved_key_ contains the key that we have skipped
+ // num_skipped times, and we haven't seen any keys
+ // greater than that,
+ // - none of the above : saved_key_ can contain anything, it doesn't
+ // matter.
+ uint64_t num_skipped = 0;
+ // For write unprepared, the target sequence number in reseek could be larger
+ // than the snapshot, and thus needs to be skipped again. This could result in
+ // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
+ // to one.
+ bool reseek_done = false;
+
+ is_blob_ = false;
+
+ do {
+ // Will update is_key_seqnum_zero_ as soon as we parse the current key,
+ // but we need to save the previous value to be used in the loop.
+ bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
+ if (!ParseKey(&ikey_)) {
+ is_key_seqnum_zero_ = false;
+ return false;
+ }
+
+ is_key_seqnum_zero_ = (ikey_.sequence == 0);
+
+ assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() ||
+ user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0);
+ if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
+ user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
+ break;
+ }
+
+ assert(prefix == nullptr || prefix_extractor_ != nullptr);
+ if (prefix != nullptr &&
+ prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) {
+ assert(prefix_same_as_start_);
+ break;
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ if (IsVisible(ikey_.sequence)) {
+ // If the previous entry has seqnum 0, the current entry cannot
+ // possibly be skipped. This condition could potentially be relaxed to
+ // prev_key.seq <= ikey_.sequence, but we are cautious because relaxing it
+ // is more prone to bugs where the same user key appears with the same
+ // sequence number.
+ if (!is_prev_key_seqnum_zero && skipping_saved_key &&
+ user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <=
+ 0) {
+ num_skipped++; // skip this entry
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ } else {
+ assert(!skipping_saved_key ||
+ user_comparator_.Compare(ikey_.user_key,
+ saved_key_.GetUserKey()) > 0);
+ num_skipped = 0;
+ reseek_done = false;
+ switch (ikey_.type) {
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ // If the iterator specified start_seqnum, we
+ // 1) return the internal key, including the type,
+ // 2) return ikey only if ikey.seqnum >= start_seqnum_.
+ // Note that if the deletion seqnum is < start_seqnum_ we
+ // just skip it like a normal iterator would.
+ if (start_seqnum_ > 0 && ikey_.sequence >= start_seqnum_) {
+ saved_key_.SetInternalKey(ikey_);
+ valid_ = true;
+ return true;
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key, !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ skipping_saved_key = true;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ }
+ break;
+ case kTypeValue:
+ case kTypeBlobIndex:
+ if (start_seqnum_ > 0) {
+ // we are taking incremental snapshot here
+ // incremental snapshots aren't supported on DB with range deletes
+ assert(ikey_.type != kTypeBlobIndex);
+ if (ikey_.sequence >= start_seqnum_) {
+ saved_key_.SetInternalKey(ikey_);
+ valid_ = true;
+ return true;
+ } else {
+ // This key and all its previous versions shouldn't be included,
+ // so set skipping_saved_key.
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ skipping_saved_key = true;
+ }
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key, !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ if (range_del_agg_.ShouldDelete(
+ ikey_, RangeDelPositioningMode::kForwardTraversal)) {
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ skipping_saved_key = true;
+ num_skipped = 0;
+ reseek_done = false;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ } else if (ikey_.type == kTypeBlobIndex) {
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ valid_ = false;
+ return false;
+ }
+
+ is_blob_ = true;
+ valid_ = true;
+ return true;
+ } else {
+ valid_ = true;
+ return true;
+ }
+ }
+ break;
+ case kTypeMerge:
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
+ if (range_del_agg_.ShouldDelete(
+ ikey_, RangeDelPositioningMode::kForwardTraversal)) {
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ skipping_saved_key = true;
+ num_skipped = 0;
+ reseek_done = false;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ } else {
+ // By now, we are sure the current ikey is going to yield a
+ // value
+ current_entry_is_merged_ = true;
+ valid_ = true;
+ return MergeValuesNewToOld(); // Go to a different state machine
+ }
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ } else {
+ PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+
+ // This key was inserted after our snapshot was taken.
+ // If this happens too many times in a row for the same user key, we want
+ // to seek to the target sequence number.
+ int cmp =
+ user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey());
+ if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
+ num_skipped++;
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+ skipping_saved_key = false;
+ num_skipped = 0;
+ reseek_done = false;
+ }
+ }
+
+ // If we have sequentially iterated via numerous equal keys, then it's
+ // better to seek so that we can avoid too many key comparisons.
+ //
+ // To avoid infinite loops, do not reseek if we have already attempted to
+ // reseek previously.
+ //
+ // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
+ // then it does not make sense to reseek as we would actually land further
+ // away from the desired key. There is opportunity for optimization here.
+ if (num_skipped > max_skip_ && !reseek_done) {
+ is_key_seqnum_zero_ = false;
+ num_skipped = 0;
+ reseek_done = true;
+ std::string last_key;
+ if (skipping_saved_key) {
+ // We're looking for the next user-key but all we see is the same
+ // user-key with decreasing sequence numbers. Fast forward to
+ // sequence number 0 and type deletion (the smallest type).
+ AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
+ 0, kTypeDeletion));
+ // Don't set skipping_saved_key = false because we may still see more
+ // user-keys equal to saved_key_.
+ } else {
+ // We saw multiple entries with this user key and sequence numbers
+ // higher than sequence_. Fast forward to sequence_.
+ // Note that this only covers the case where this key was overwritten
+ // many times since our snapshot was taken, not the case where a lot of
+ // different keys were inserted after our snapshot was taken.
+ AppendInternalKey(&last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek));
+ }
+ iter_.Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ } else {
+ iter_.Next();
+ }
+ } while (iter_.Valid());
+
+ valid_ = false;
+ return iter_.status().ok();
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_.key() points to the first merge type entry
+// saved_key_ stores the user key
+// POST: saved_value_ has the merged value for the user key
+// iter_ points to the next entry (or invalid)
+bool DBIter::MergeValuesNewToOld() {
+ if (!merge_operator_) {
+ ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
+ status_ = Status::InvalidArgument("merge_operator_ must be set.");
+ valid_ = false;
+ return false;
+ }
+
+ // Temporarily pin the blocks that hold merge operands
+ TempPinData();
+ merge_context_.Clear();
+ // Start the merge process by pushing the first operand
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
+
+ ParsedInternalKey ikey;
+ Status s;
+ for (iter_.Next(); iter_.Valid(); iter_.Next()) {
+ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
+ // hit the next user key, stop right here
+ break;
+ } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
+ range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kForwardTraversal)) {
+ // hit a delete with the same user key, stop right here
+ // iter_ is positioned after delete
+ iter_.Next();
+ break;
+ } else if (kTypeValue == ikey.type) {
+ // hit a put, merge the put value with operands and store the
+ // final result in saved_value_. We are done!
+ const Slice val = iter_.value();
+ s = MergeHelper::TimedFullMerge(
+ merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(),
+ &saved_value_, logger_, statistics_, env_, &pinned_value_, true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+ // iter_ is positioned after put
+ iter_.Next();
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ return true;
+ } else if (kTypeMerge == ikey.type) {
+ // hit a merge, add the value as an operand and run associative merge.
+ // when complete, add result to operands and continue.
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } else if (kTypeBlobIndex == ikey.type) {
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ } else {
+ status_ =
+ Status::NotSupported("Blob DB does not support merge operator.");
+ }
+ valid_ = false;
+ return false;
+ } else {
+ assert(false);
+ }
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ // We either exhausted all internal keys under this user key, or hit
+ // a deletion marker.
+ // Feed null as the existing value to the merge operator, so that the
+ // client can differentiate this scenario and act accordingly.
+ s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(),
+ nullptr, merge_context_.GetOperands(),
+ &saved_value_, logger_, statistics_, env_,
+ &pinned_value_, true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+
+ assert(status_.ok());
+ return true;
+}
+
+void DBIter::Prev() {
+ assert(valid_);
+ assert(status_.ok());
+
+ PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_);
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+ bool ok = true;
+ if (direction_ == kForward) {
+ if (!ReverseToBackward()) {
+ ok = false;
+ }
+ }
+ if (ok) {
+ Slice prefix;
+ if (prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix = prefix_.GetUserKey();
+ }
+ PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
+ }
+
+ if (statistics_ != nullptr) {
+ local_stats_.prev_count_++;
+ if (valid_) {
+ local_stats_.prev_found_count_++;
+ local_stats_.bytes_read_ += (key().size() + value().size());
+ }
+ }
+}
+
+bool DBIter::ReverseToForward() {
+ assert(iter_.status().ok());
+
+ // When moving backwards, iter_ is positioned on the _previous_ key, which
+ // may not exist or may have a different prefix than the current key().
+ // If that's the case, seek iter_ to the current key.
+ if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+ IterKey last_key;
+ last_key.SetInternalKey(ParsedInternalKey(
+ saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+ iter_.Seek(last_key.GetInternalKey());
+ }
+
+ direction_ = kForward;
+ // Skip keys less than the current key() (a.k.a. saved_key_).
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
+ return true;
+ }
+ iter_.Next();
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+}
+
+// Move iter_ to the key before saved_key_.
+bool DBIter::ReverseToBackward() {
+ assert(iter_.status().ok());
+
+ // When current_entry_is_merged_ is true, iter_ may be positioned on the
+ // next key, which may not exist or may have a prefix different from the
+ // current one. If that's the case, seek to saved_key_.
+ if (current_entry_is_merged_ &&
+ (!expect_total_order_inner_iter() || !iter_.Valid())) {
+ IterKey last_key;
+ // Using kMaxSequenceNumber and kValueTypeForSeek
+ // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
+ // than saved_key_.
+ last_key.SetInternalKey(ParsedInternalKey(
+ saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+ if (!expect_total_order_inner_iter()) {
+ iter_.SeekForPrev(last_key.GetInternalKey());
+ } else {
+ // Some iterators may not support SeekForPrev(), so we avoid using it
+ // when prefix seek mode is disabled. This is somewhat expensive
+ // (an extra Prev(), as well as an extra change of direction of iter_),
+ // so we may need to reconsider it later.
+ iter_.Seek(last_key.GetInternalKey());
+ if (!iter_.Valid() && iter_.status().ok()) {
+ iter_.SeekToLast();
+ }
+ }
+ }
+
+ direction_ = kReverse;
+ return FindUserKeyBeforeSavedKey();
+}
+
+void DBIter::PrevInternal(const Slice* prefix) {
+ while (iter_.Valid()) {
+ saved_key_.SetUserKey(
+ ExtractUserKey(iter_.key()),
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+
+ assert(prefix == nullptr || prefix_extractor_ != nullptr);
+ if (prefix != nullptr &&
+ prefix_extractor_->Transform(saved_key_.GetUserKey())
+ .compare(*prefix) != 0) {
+ assert(prefix_same_as_start_);
+ // Current key does not have the same prefix as start
+ valid_ = false;
+ return;
+ }
+
+ assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
+ user_comparator_.Compare(saved_key_.GetUserKey(),
+ *iterate_lower_bound_) >= 0);
+ if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
+ user_comparator_.Compare(saved_key_.GetUserKey(),
+ *iterate_lower_bound_) < 0) {
+ // We've iterated earlier than the user-specified lower bound.
+ valid_ = false;
+ return;
+ }
+
+ if (!FindValueForCurrentKey()) { // assigns valid_
+ return;
+ }
+
+ // Whether or not we found a value for current key, we need iter_ to end up
+ // on a smaller key.
+ if (!FindUserKeyBeforeSavedKey()) {
+ return;
+ }
+
+ if (valid_) {
+ // Found the value.
+ return;
+ }
+
+ if (TooManyInternalKeysSkipped(false)) {
+ return;
+ }
+ }
+
+ // We haven't found any key - iterator is not valid
+ valid_ = false;
+}
+
+// Used for backwards iteration.
+// Looks at the entries with user key saved_key_ and finds the most up-to-date
+// value for it, or executes a merge, or determines that the value was deleted.
+// Sets valid_ to true if the value is found and is ready to be presented to
+// the user through value().
+// Sets valid_ to false if the value was deleted, and we should try another key.
+// Returns false if an error occurred, and !status().ok() and !valid_.
+//
+// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
+// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
+// the entry just before them, or on the entry just after them.
+bool DBIter::FindValueForCurrentKey() {
+ assert(iter_.Valid());
+ merge_context_.Clear();
+ current_entry_is_merged_ = false;
+ // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or
+ // kTypeValue)
+ ValueType last_not_merge_type = kTypeDeletion;
+ ValueType last_key_entry_type = kTypeDeletion;
+
+ // Temporarily pin blocks that hold (merge operands / the value)
+ ReleaseTempPinnedData();
+ TempPinData();
+ size_t num_skipped = 0;
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (!IsVisible(ikey.sequence) ||
+ !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
+ break;
+ }
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ // This user key has lots of entries.
+ // We're going from old to new, and it's taking too long. Let's do a Seek()
+ // and go from new to old. This helps when a key was overwritten many times.
+ if (num_skipped >= max_skip_) {
+ return FindValueForCurrentKeyUsingSeek();
+ }
+
+ last_key_entry_type = ikey.type;
+ switch (last_key_entry_type) {
+ case kTypeValue:
+ case kTypeBlobIndex:
+ if (range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kBackwardTraversal)) {
+ last_key_entry_type = kTypeRangeDeletion;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ } else {
+ assert(iter_.iter()->IsValuePinned());
+ pinned_value_ = iter_.value();
+ }
+ merge_context_.Clear();
+ last_not_merge_type = last_key_entry_type;
+ break;
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ merge_context_.Clear();
+ last_not_merge_type = last_key_entry_type;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ break;
+ case kTypeMerge:
+ if (range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kBackwardTraversal)) {
+ merge_context_.Clear();
+ last_key_entry_type = kTypeRangeDeletion;
+ last_not_merge_type = last_key_entry_type;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ } else {
+ assert(merge_operator_ != nullptr);
+ merge_context_.PushOperandBack(
+ iter_.value(),
+ iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ iter_.Prev();
+ ++num_skipped;
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ Status s;
+ is_blob_ = false;
+ switch (last_key_entry_type) {
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ case kTypeRangeDeletion:
+ valid_ = false;
+ return true;
+ case kTypeMerge:
+ current_entry_is_merged_ = true;
+ if (last_not_merge_type == kTypeDeletion ||
+ last_not_merge_type == kTypeSingleDeletion ||
+ last_not_merge_type == kTypeRangeDeletion) {
+ s = MergeHelper::TimedFullMerge(
+ merge_operator_, saved_key_.GetUserKey(), nullptr,
+ merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
+ env_, &pinned_value_, true);
+ } else if (last_not_merge_type == kTypeBlobIndex) {
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ } else {
+ status_ =
+ Status::NotSupported("Blob DB does not support merge operator.");
+ }
+ valid_ = false;
+ return false;
+ } else {
+ assert(last_not_merge_type == kTypeValue);
+ s = MergeHelper::TimedFullMerge(
+ merge_operator_, saved_key_.GetUserKey(), &pinned_value_,
+ merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
+ env_, &pinned_value_, true);
+ }
+ break;
+ case kTypeValue:
+ // do nothing - we already have the value in pinned_value_
+ break;
+ case kTypeBlobIndex:
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ valid_ = false;
+ return false;
+ }
+ is_blob_ = true;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+ valid_ = true;
+ return true;
+}
+
+// This function is used in FindValueForCurrentKey.
+// We use the Seek() function instead of Prev() to find the necessary value.
+// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
+// Would be nice to reuse some code.
+bool DBIter::FindValueForCurrentKeyUsingSeek() {
+ // FindValueForCurrentKey will enable pinning before calling
+ // FindValueForCurrentKeyUsingSeek()
+ assert(pinned_iters_mgr_.PinningEnabled());
+ std::string last_key;
+ AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
+ sequence_, kValueTypeForSeek));
+ iter_.Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+ // If a read_callback is present, the value we seek to may not be visible.
+ // Find the next value that's visible.
+ ParsedInternalKey ikey;
+ is_blob_ = false;
+ while (true) {
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return iter_.status().ok();
+ }
+
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
+ // No visible values for this key, even though FindValueForCurrentKey()
+ // has seen some. This is possible if we're using a tailing iterator, and
+ // the entries were discarded in a compaction.
+ valid_ = false;
+ return true;
+ }
+
+ if (IsVisible(ikey.sequence)) {
+ break;
+ }
+
+ iter_.Next();
+ }
+
+ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+ range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kBackwardTraversal)) {
+ valid_ = false;
+ return true;
+ }
+ if (ikey.type == kTypeBlobIndex && !allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ valid_ = false;
+ return false;
+ }
+ if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) {
+ assert(iter_.iter()->IsValuePinned());
+ pinned_value_ = iter_.value();
+ is_blob_ = (ikey.type == kTypeBlobIndex);
+ valid_ = true;
+ return true;
+ }
+
+ // kTypeMerge. We need to collect all kTypeMerge values and save them
+ // in operands
+ assert(ikey.type == kTypeMerge);
+ current_entry_is_merged_ = true;
+ merge_context_.Clear();
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ while (true) {
+ iter_.Next();
+
+ if (!iter_.Valid()) {
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ break;
+ }
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
+ break;
+ }
+
+ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+ range_del_agg_.ShouldDelete(
+ ikey, RangeDelPositioningMode::kForwardTraversal)) {
+ break;
+ } else if (ikey.type == kTypeValue) {
+ const Slice val = iter_.value();
+ Status s = MergeHelper::TimedFullMerge(
+ merge_operator_, saved_key_.GetUserKey(), &val,
+ merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
+ env_, &pinned_value_, true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+ valid_ = true;
+ return true;
+ } else if (ikey.type == kTypeMerge) {
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } else if (ikey.type == kTypeBlobIndex) {
+ if (!allow_blob_) {
+ ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
+ status_ = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ } else {
+ status_ =
+ Status::NotSupported("Blob DB does not support merge operator.");
+ }
+ valid_ = false;
+ return false;
+ } else {
+ assert(false);
+ }
+ }
+
+ Status s = MergeHelper::TimedFullMerge(
+ merge_operator_, saved_key_.GetUserKey(), nullptr,
+ merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_,
+ &pinned_value_, true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+
+ // Make sure we leave iter_ in a good state. If it's valid and we don't care
+ // about prefixes, that's already good enough. Otherwise it needs to be
+ // seeked to the current key.
+ if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+ if (!expect_total_order_inner_iter()) {
+ iter_.SeekForPrev(last_key);
+ } else {
+ iter_.Seek(last_key);
+ if (!iter_.Valid() && iter_.status().ok()) {
+ iter_.SeekToLast();
+ }
+ }
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ }
+
+ valid_ = true;
+ return true;
+}
+
+// Move backwards until we reach a key smaller than saved_key_.
+// Changes valid_ only if return value is false.
+bool DBIter::FindUserKeyBeforeSavedKey() {
+ assert(status_.ok());
+ size_t num_skipped = 0;
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) {
+ return true;
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ assert(ikey.sequence != kMaxSequenceNumber);
+ if (!IsVisible(ikey.sequence)) {
+ PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+ } else {
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ }
+
+ if (num_skipped >= max_skip_) {
+ num_skipped = 0;
+ IterKey last_key;
+ last_key.SetInternalKey(ParsedInternalKey(
+ saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+ // It would be more efficient to use SeekForPrev() here, but some
+ // iterators may not support it.
+ iter_.Seek(last_key.GetInternalKey());
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ if (!iter_.Valid()) {
+ break;
+ }
+ } else {
+ ++num_skipped;
+ }
+
+ iter_.Prev();
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+}
+
+bool DBIter::TooManyInternalKeysSkipped(bool increment) {
+ if ((max_skippable_internal_keys_ > 0) &&
+ (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
+ valid_ = false;
+ status_ = Status::Incomplete("Too many internal keys skipped.");
+ return true;
+ } else if (increment) {
+ num_internal_keys_skipped_++;
+ }
+ return false;
+}
+
+bool DBIter::IsVisible(SequenceNumber sequence) {
+ if (read_callback_ == nullptr) {
+ return sequence <= sequence_;
+ } else {
+ return read_callback_->IsVisible(sequence);
+ }
+}
+
+void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
+ is_key_seqnum_zero_ = false;
+ SequenceNumber seq = sequence_;
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(target, seq);
+
+ if (iterate_lower_bound_ != nullptr &&
+ user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) <
+ 0) {
+ // Seek key is smaller than the lower bound.
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(*iterate_lower_bound_, seq);
+ }
+}
+
+void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
+ is_key_seqnum_zero_ = false;
+ saved_key_.Clear();
+ // Now saved_key_ is used to store the internal key.
+ saved_key_.SetInternalKey(target, 0 /* sequence_number */,
+ kValueTypeForSeekForPrev);
+
+ if (iterate_upper_bound_ != nullptr &&
+ user_comparator_.Compare(saved_key_.GetUserKey(),
+ *iterate_upper_bound_) >= 0) {
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber);
+ }
+}
+
+void DBIter::Seek(const Slice& target) {
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
+ StopWatch sw(env_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+ if (db_impl_ != nullptr && cfd_ != nullptr) {
+ db_impl_->TraceIteratorSeek(cfd_->GetID(), target);
+ }
+#endif // ROCKSDB_LITE
+
+ status_ = Status::OK();
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+
+ // Seek the inner iterator based on the target key.
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+
+ SetSavedKeyToSeekTarget(target);
+ iter_.Seek(saved_key_.GetInternalKey());
+
+ range_del_agg_.InvalidateRangeDelMapPositions();
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ }
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return;
+ }
+ direction_ = kForward;
+
+ // Now the inner iterator is placed at the target position. From there,
+ // we need to find the next key that is visible to the user.
+ ClearSavedValue();
+ if (prefix_same_as_start_) {
+ // The case where the iterator needs to be invalidated if it has exhausted
+ // keys within the same prefix as the seek key.
+ assert(prefix_extractor_ != nullptr);
+ Slice target_prefix = prefix_extractor_->Transform(target);
+ FindNextUserEntry(false /* not skipping saved_key */,
+ &target_prefix /* prefix */);
+ if (valid_) {
+ // Remember the prefix of the seek key for the future Next() call to
+ // check.
+ prefix_.SetUserKey(target_prefix);
+ }
+ } else {
+ FindNextUserEntry(false /* not skipping saved_key */, nullptr);
+ }
+ if (!valid_) {
+ return;
+ }
+
+ // Updating stats and perf context counters.
+ if (statistics_ != nullptr) {
+ // The seek landed on a visible entry; record it as found.
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ }
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+}
+
+void DBIter::SeekForPrev(const Slice& target) {
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
+ StopWatch sw(env_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+ if (db_impl_ != nullptr && cfd_ != nullptr) {
+ db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target);
+ }
+#endif // ROCKSDB_LITE
+
+ status_ = Status::OK();
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+
+ // Seek the inner iterator based on the target key.
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ SetSavedKeyToSeekForPrevTarget(target);
+ iter_.SeekForPrev(saved_key_.GetInternalKey());
+ range_del_agg_.InvalidateRangeDelMapPositions();
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ }
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return;
+ }
+ direction_ = kReverse;
+
+ // Now the inner iterator is placed at the target position. From there,
+ // we need to find the first key that is visible to the user in the
+ // backward direction.
+ ClearSavedValue();
+ if (prefix_same_as_start_) {
+ // The case where the iterator needs to be invalidated if it has exhausted
+ // keys within the same prefix as the seek key.
+ assert(prefix_extractor_ != nullptr);
+ Slice target_prefix = prefix_extractor_->Transform(target);
+ PrevInternal(&target_prefix);
+ if (valid_) {
+ // Remember the prefix of the seek key for the future Prev() call to
+ // check.
+ prefix_.SetUserKey(target_prefix);
+ }
+ } else {
+ PrevInternal(nullptr);
+ }
+
+ // Report stats and perf context.
+ if (statistics_ != nullptr && valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+}
+
+void DBIter::SeekToFirst() {
+ if (iterate_lower_bound_ != nullptr) {
+ Seek(*iterate_lower_bound_);
+ return;
+ }
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
+ // Disable the max_skip_-based reseek optimization if a prefix extractor
+ // is set, because prefix seek will be used.
+ if (!expect_total_order_inner_iter()) {
+ max_skip_ = std::numeric_limits<uint64_t>::max();
+ }
+ status_ = Status::OK();
+ direction_ = kForward;
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+ ClearSavedValue();
+ is_key_seqnum_zero_ = false;
+
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ iter_.SeekToFirst();
+ range_del_agg_.InvalidateRangeDelMapPositions();
+ }
+
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ if (iter_.Valid()) {
+ saved_key_.SetUserKey(
+ ExtractUserKey(iter_.key()),
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+ FindNextUserEntry(false /* not skipping saved_key */,
+ nullptr /* no prefix check */);
+ if (statistics_ != nullptr) {
+ if (valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+ }
+ } else {
+ valid_ = false;
+ }
+ if (valid_ && prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey()));
+ }
+}
+
+void DBIter::SeekToLast() {
+ if (iterate_upper_bound_ != nullptr) {
+ // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
+ SeekForPrev(*iterate_upper_bound_);
+ if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) {
+ ReleaseTempPinnedData();
+ PrevInternal(nullptr);
+ }
+ return;
+ }
+
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
+  // Don't use the skip-based reseek optimization (iter_.Seek()) if a prefix
+  // extractor is set, because prefix seek will be used.
+ if (!expect_total_order_inner_iter()) {
+ max_skip_ = std::numeric_limits<uint64_t>::max();
+ }
+ status_ = Status::OK();
+ direction_ = kReverse;
+ ReleaseTempPinnedData();
+ ResetInternalKeysSkippedCounter();
+ ClearSavedValue();
+ is_key_seqnum_zero_ = false;
+
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ iter_.SeekToLast();
+ range_del_agg_.InvalidateRangeDelMapPositions();
+ }
+ PrevInternal(nullptr);
+ if (statistics_ != nullptr) {
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ if (valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+ }
+ if (valid_ && prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey()));
+ }
+}
+
+Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* user_key_comparator,
+ InternalIterator* internal_iter,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool allow_blob) {
+ DBIter* db_iter = new DBIter(
+ env, read_options, cf_options, mutable_cf_options, user_key_comparator,
+ internal_iter, sequence, false, max_sequential_skip_in_iterations,
+ read_callback, db_impl, cfd, allow_blob);
+ return db_iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter.h b/src/rocksdb/db/db_iter.h
new file mode 100644
index 000000000..32704e4d5
--- /dev/null
+++ b/src/rocksdb/db/db_iter.h
@@ -0,0 +1,344 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file declares the factory function of DBIter, as well as class DBIter
+// itself, whose implementation lives in db_iter.cc. A wrapped form, class
+// ArenaWrappedDBIter, is declared in arena_wrapped_db_iter.h.
+// DBIter is an iterator that converts internal keys (yielded by an
+// InternalIterator) that were live at the specified sequence number into
+// appropriate user keys.
+// Each internal key consists of a user key, a sequence number, and a value
+// type. DBIter deals with multiple key versions, tombstones, merge operands,
+// etc., and exposes an Iterator.
+// For example, DBIter may wrap the following InternalIterator:
+// user key: AAA value: v3 seqno: 100 type: Put
+// user key: AAA value: v2 seqno: 97 type: Put
+// user key: AAA value: v1 seqno: 95 type: Put
+// user key: BBB value: v1 seqno: 90 type: Put
+// user key: BBC value: N/A seqno: 98 type: Delete
+// user key: BBC value: v1 seqno: 95 type: Put
+// If the snapshot passed in is 102, then the DBIter is expected to
+// expose the following iterator:
+// key: AAA value: v3
+// key: BBB value: v1
+// If the snapshot passed in is 96, then it should expose:
+// key: AAA value: v1
+// key: BBB value: v1
+// key: BBC value: v1
+//
+
+// Memtables and sstables that make up the DB representation contain
+// (userkey,seq,type) => uservalue entries. DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter final : public Iterator {
+ public:
+ // The following is grossly complicated. TODO: clean it up
+ // Which direction is the iterator currently moving?
+ // (1) When moving forward:
+ // (1a) if current_entry_is_merged_ = false, the internal iterator is
+ // positioned at the exact entry that yields this->key(), this->value()
+ // (1b) if current_entry_is_merged_ = true, the internal iterator is
+ // positioned immediately after the last entry that contributed to the
+ // current this->value(). That entry may or may not have key equal to
+ // this->key().
+ // (2) When moving backwards, the internal iterator is positioned
+ // just before all entries whose user key == this->key().
+ enum Direction { kForward, kReverse };
+
+ // LocalStatistics contain Statistics counters that will be aggregated per
+ // each iterator instance and then will be sent to the global statistics when
+ // the iterator is destroyed.
+ //
+  // The purpose of this approach is to avoid the perf regression that
+  // happens when multiple threads bump the atomic counters from
+  // DBIter::Next().
+ struct LocalStatistics {
+ explicit LocalStatistics() { ResetCounters(); }
+
+ void ResetCounters() {
+ next_count_ = 0;
+ next_found_count_ = 0;
+ prev_count_ = 0;
+ prev_found_count_ = 0;
+ bytes_read_ = 0;
+ skip_count_ = 0;
+ }
+
+ void BumpGlobalStatistics(Statistics* global_statistics) {
+ RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_);
+ RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_);
+ RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_);
+ RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_);
+ RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_);
+ RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_);
+ PERF_COUNTER_ADD(iter_read_bytes, bytes_read_);
+ ResetCounters();
+ }
+
+ // Map to Tickers::NUMBER_DB_NEXT
+ uint64_t next_count_;
+ // Map to Tickers::NUMBER_DB_NEXT_FOUND
+ uint64_t next_found_count_;
+ // Map to Tickers::NUMBER_DB_PREV
+ uint64_t prev_count_;
+ // Map to Tickers::NUMBER_DB_PREV_FOUND
+ uint64_t prev_found_count_;
+ // Map to Tickers::ITER_BYTES_READ
+ uint64_t bytes_read_;
+ // Map to Tickers::NUMBER_ITER_SKIP
+ uint64_t skip_count_;
+ };
+
+ DBIter(Env* _env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
+ InternalIterator* iter, SequenceNumber s, bool arena_mode,
+ uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool allow_blob);
+
+ // No copying allowed
+ DBIter(const DBIter&) = delete;
+ void operator=(const DBIter&) = delete;
+
+ ~DBIter() override {
+ // Release pinned data if any
+ if (pinned_iters_mgr_.PinningEnabled()) {
+ pinned_iters_mgr_.ReleasePinnedData();
+ }
+ RecordTick(statistics_, NO_ITERATOR_DELETED);
+ ResetInternalKeysSkippedCounter();
+ local_stats_.BumpGlobalStatistics(statistics_);
+ iter_.DeleteIter(arena_mode_);
+ }
+ void SetIter(InternalIterator* iter) {
+ assert(iter_.iter() == nullptr);
+ iter_.Set(iter);
+ iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+ }
+ ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; }
+
+ bool Valid() const override { return valid_; }
+ Slice key() const override {
+ assert(valid_);
+ if (start_seqnum_ > 0) {
+ return saved_key_.GetInternalKey();
+ } else {
+ return saved_key_.GetUserKey();
+ }
+ }
+ Slice value() const override {
+ assert(valid_);
+ if (current_entry_is_merged_) {
+ // If pinned_value_ is set then the result of merge operator is one of
+ // the merge operands and we should return it.
+ return pinned_value_.data() ? pinned_value_ : saved_value_;
+ } else if (direction_ == kReverse) {
+ return pinned_value_;
+ } else {
+ return iter_.value();
+ }
+ }
+ Status status() const override {
+ if (status_.ok()) {
+ return iter_.status();
+ } else {
+ assert(!valid_);
+ return status_;
+ }
+ }
+ bool IsBlob() const {
+ assert(valid_ && (allow_blob_ || !is_blob_));
+ return is_blob_;
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override;
+
+ void Next() final override;
+ void Prev() final override;
+ void Seek(const Slice& target) final override;
+ void SeekForPrev(const Slice& target) final override;
+ void SeekToFirst() final override;
+ void SeekToLast() final override;
+ Env* env() const { return env_; }
+ void set_sequence(uint64_t s) {
+ sequence_ = s;
+ if (read_callback_) {
+ read_callback_->Refresh(s);
+ }
+ }
+ void set_valid(bool v) { valid_ = v; }
+
+ private:
+ // For all methods in this block:
+ // PRE: iter_->Valid() && status_.ok()
+  // Return false if there was an error; in that case status() is non-ok and
+  // valid_ is false. Callers would usually stop what they were doing and
+  // return.
+ bool ReverseToForward();
+ bool ReverseToBackward();
+  // Set saved_key_ to the target seek key, with the proper sequence number
+  // set. It might get adjusted if the seek key is smaller than the
+  // iterator's lower bound.
+ void SetSavedKeyToSeekTarget(const Slice& target);
+  // Set saved_key_ to the target seek key, with the proper sequence number
+  // set. It might get adjusted if the seek key is larger than the iterator's
+  // upper bound.
+ void SetSavedKeyToSeekForPrevTarget(const Slice& target);
+ bool FindValueForCurrentKey();
+ bool FindValueForCurrentKeyUsingSeek();
+ bool FindUserKeyBeforeSavedKey();
+ // If `skipping_saved_key` is true, the function will keep iterating until it
+ // finds a user key that is larger than `saved_key_`.
+  // If `prefix` is not null, the iterator stops once all keys for the prefix
+  // are exhausted, and the iterator is set to invalid.
+ bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
+ // Internal implementation of FindNextUserEntry().
+ bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
+ bool ParseKey(ParsedInternalKey* key);
+ bool MergeValuesNewToOld();
+
+ // If prefix is not null, we need to set the iterator to invalid if no more
+ // entry can be found within the prefix.
+ void PrevInternal(const Slice* prefix);
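+  // Returns true once more than max_skippable_internal_keys_ internal keys
+  // have been skipped (the iterator is then invalidated with a non-ok
+  // status); otherwise bumps num_internal_keys_skipped_ unless `increment`
+  // is false.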
+ bool TooManyInternalKeysSkipped(bool increment = true);
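+  // Whether an internal entry with the given sequence number is visible at
+  // this iterator's sequence_ (and, if read_callback_ is set, also accepted
+  // by it).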
+ bool IsVisible(SequenceNumber sequence);
+
+ // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
+ // is called
+ void TempPinData() {
+ if (!pin_thru_lifetime_) {
+ pinned_iters_mgr_.StartPinning();
+ }
+ }
+
+ // Release blocks pinned by TempPinData()
+ void ReleaseTempPinnedData() {
+ if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
+ pinned_iters_mgr_.ReleasePinnedData();
+ }
+ }
+
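+  // Reset saved_value_. If its buffer has grown beyond ~1 MiB (1048576
+  // bytes), release the memory entirely instead of keeping the capacity
+  // around for reuse.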
+ inline void ClearSavedValue() {
+ if (saved_value_.capacity() > 1048576) {
+ std::string empty;
+ swap(empty, saved_value_);
+ } else {
+ saved_value_.clear();
+ }
+ }
+
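+  // Fold num_internal_keys_skipped_ into the locally aggregated skip count
+  // and reset it. The currently valid key, if any, is not counted as
+  // skipped.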
+ inline void ResetInternalKeysSkippedCounter() {
+ local_stats_.skip_count_ += num_internal_keys_skipped_;
+ if (valid_) {
+ local_stats_.skip_count_--;
+ }
+ num_internal_keys_skipped_ = 0;
+ }
+
+ bool expect_total_order_inner_iter() {
+ assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
+ return expect_total_order_inner_iter_;
+ }
+
+ const SliceTransform* prefix_extractor_;
+ Env* const env_;
+ Logger* logger_;
+ UserComparatorWrapper user_comparator_;
+ const MergeOperator* const merge_operator_;
+ IteratorWrapper iter_;
+ ReadCallback* read_callback_;
+ // Max visible sequence number. It is normally the snapshot seq unless we have
+ // uncommitted data in db as in WriteUnCommitted.
+ SequenceNumber sequence_;
+
+ IterKey saved_key_;
+  // Reusable internal key data structure. This is only used inside one
+  // function and should not be used across functions. Reusing this object
+  // avoids the overhead of constructing it on every call.
+ ParsedInternalKey ikey_;
+ std::string saved_value_;
+ Slice pinned_value_;
+ // for prefix seek mode to support prev()
+ Statistics* statistics_;
+ uint64_t max_skip_;
+ uint64_t max_skippable_internal_keys_;
+ uint64_t num_internal_keys_skipped_;
+ const Slice* iterate_lower_bound_;
+ const Slice* iterate_upper_bound_;
+
+ // The prefix of the seek key. It is only used when prefix_same_as_start_
+ // is true and prefix extractor is not null. In Next() or Prev(), current keys
+ // will be checked against this prefix, so that the iterator can be
+  // invalidated if the keys in this prefix have been exhausted. Set it using
+  // SetUserKey() and read it using GetUserKey().
+ IterKey prefix_;
+
+ Status status_;
+ Direction direction_;
+ bool valid_;
+ bool current_entry_is_merged_;
+ // True if we know that the current entry's seqnum is 0.
+  // This tells us that the next entry will be for a different user key.
+ bool is_key_seqnum_zero_;
+ const bool prefix_same_as_start_;
+  // Means that we will pin all data blocks we read as long as the Iterator
+  // is not deleted; will be true if ReadOptions::pin_data is true.
+ const bool pin_thru_lifetime_;
+ // Expect the inner iterator to maintain a total order.
+ // prefix_extractor_ must be non-NULL if the value is false.
+ const bool expect_total_order_inner_iter_;
+ bool allow_blob_;
+ bool is_blob_;
+ bool arena_mode_;
+ // List of operands for merge operator.
+ MergeContext merge_context_;
+ ReadRangeDelAggregator range_del_agg_;
+ LocalStatistics local_stats_;
+ PinnedIteratorsManager pinned_iters_mgr_;
+#ifdef ROCKSDB_LITE
+ ROCKSDB_FIELD_UNUSED
+#endif
+ DBImpl* db_impl_;
+#ifdef ROCKSDB_LITE
+ ROCKSDB_FIELD_UNUSED
+#endif
+ ColumnFamilyData* cfd_;
+  // For diff snapshots we want a lower bound on the seqnum;
+  // if this value is > 0, the iterator will return internal keys.
+ SequenceNumber start_seqnum_;
+};
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified `sequence` number
+// into appropriate user keys.
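+//
+// A minimal usage sketch (illustrative only; it mirrors how the unit tests in
+// db_iter_test.cc drive this factory, and assumes `options` and a
+// heap-allocated `internal_iter` already exist):
+//
+//   ReadOptions ro;
+//   std::unique_ptr<Iterator> db_iter(NewDBIterator(
+//       Env::Default(), ro, ImmutableCFOptions(options),
+//       MutableCFOptions(options), BytewiseComparator(), internal_iter,
+//       10 /* sequence */, options.max_sequential_skip_in_iterations,
+//       nullptr /* read_callback */));
+//   for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+//     // db_iter->key()/value() expose only the newest version of each user
+//     // key that is visible at the given sequence number.
+//   }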
+extern Iterator* NewDBIterator(
+ Env* env, const ReadOptions& read_options,
+ const ImmutableCFOptions& cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* user_key_comparator, InternalIterator* internal_iter,
+ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl = nullptr,
+ ColumnFamilyData* cfd = nullptr, bool allow_blob = false);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter_stress_test.cc b/src/rocksdb/db/db_iter_stress_test.cc
new file mode 100644
index 000000000..57cd9866e
--- /dev/null
+++ b/src/rocksdb/db/db_iter_stress_test.cc
@@ -0,0 +1,654 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifdef GFLAGS
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(verbose, false,
+ "Print huge, detailed trace. Intended for debugging failures.");
+
+#else
+
+void ParseCommandLineFlags(int*, char***, bool) {}
+bool FLAGS_verbose = false;
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIteratorStressTest : public testing::Test {
+ public:
+ Env* env_;
+
+ DBIteratorStressTest() : env_(Env::Default()) {}
+};
+
+namespace {
+
+struct Entry {
+ std::string key;
+ ValueType type; // kTypeValue, kTypeDeletion, kTypeMerge
+ uint64_t sequence;
+ std::string ikey; // internal key, made from `key`, `sequence` and `type`
+ std::string value;
+ // If false, we'll pretend that this entry doesn't exist.
+ bool visible = true;
+
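+  // Order by user key ascending and, within the same user key, by
+  // (sequence, type) descending, matching the DB's internal key ordering.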
+ bool operator<(const Entry& e) const {
+ if (key != e.key) return key < e.key;
+ return std::tie(sequence, type) > std::tie(e.sequence, e.type);
+ }
+};
+
+struct Data {
+ std::vector<Entry> entries;
+
+ // Indices in `entries` with `visible` = false.
+ std::vector<size_t> hidden;
+ // Keys of entries whose `visible` changed since the last seek of iterators.
+ std::set<std::string> recently_touched_keys;
+};
+
+struct StressTestIterator : public InternalIterator {
+ Data* data;
+ Random64* rnd;
+ InternalKeyComparator cmp;
+
+  // Each operation will return an error with this probability...
+ double error_probability = 0;
+ // ... and add/remove entries with this probability.
+ double mutation_probability = 0;
+  // The probability of hiding vs unhiding entries will be chosen so that the
+  // fraction of hidden entries stays somewhat close to this number.
+ double target_hidden_fraction = 0;
+ // If true, print all mutations to stdout for debugging.
+ bool trace = false;
+
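+  // Index of the current entry in data->entries; any out-of-range value
+  // (e.g. -1) means the iterator is invalid.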
+ int iter = -1;
+ Status status_;
+
+ StressTestIterator(Data* _data, Random64* _rnd, const Comparator* _cmp)
+ : data(_data), rnd(_rnd), cmp(_cmp) {}
+
+ bool Valid() const override {
+ if (iter >= 0 && iter < (int)data->entries.size()) {
+ assert(status_.ok());
+ return true;
+ }
+ return false;
+ }
+
+ Status status() const override { return status_; }
+
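+  // With probability error_probability, put the iterator into an error state
+  // (randomly Incomplete or IOError) and return true so the caller bails out
+  // of the current operation.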
+ bool MaybeFail() {
+ if (rnd->Next() >=
+ std::numeric_limits<uint64_t>::max() * error_probability) {
+ return false;
+ }
+ if (rnd->Next() % 2) {
+ status_ = Status::Incomplete("test");
+ } else {
+ status_ = Status::IOError("test");
+ }
+ if (trace) {
+ std::cout << "injecting " << status_.ToString() << std::endl;
+ }
+ iter = -1;
+ return true;
+ }
+
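+  // With probability mutation_probability, hide and/or unhide a few random
+  // entries (3 on average), steering the hidden fraction toward
+  // target_hidden_fraction and recording the keys that were touched.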
+ void MaybeMutate() {
+ if (rnd->Next() >=
+ std::numeric_limits<uint64_t>::max() * mutation_probability) {
+ return;
+ }
+ do {
+      // If too many entries are hidden, hide less; otherwise hide more.
+ double hide_probability =
+ data->hidden.size() > data->entries.size() * target_hidden_fraction
+ ? 1. / 3
+ : 2. / 3;
+ if (data->hidden.empty()) {
+ hide_probability = 1;
+ }
+ bool do_hide =
+ rnd->Next() < std::numeric_limits<uint64_t>::max() * hide_probability;
+ if (do_hide) {
+ // Hide a random entry.
+ size_t idx = rnd->Next() % data->entries.size();
+ Entry& e = data->entries[idx];
+ if (e.visible) {
+ if (trace) {
+ std::cout << "hiding idx " << idx << std::endl;
+ }
+ e.visible = false;
+ data->hidden.push_back(idx);
+ data->recently_touched_keys.insert(e.key);
+ } else {
+ // Already hidden. Let's go unhide something instead, just because
+ // it's easy and it doesn't really matter what we do.
+ do_hide = false;
+ }
+ }
+ if (!do_hide) {
+ // Unhide a random entry.
+ size_t hi = rnd->Next() % data->hidden.size();
+ size_t idx = data->hidden[hi];
+ if (trace) {
+ std::cout << "unhiding idx " << idx << std::endl;
+ }
+ Entry& e = data->entries[idx];
+ assert(!e.visible);
+ e.visible = true;
+ data->hidden[hi] = data->hidden.back();
+ data->hidden.pop_back();
+ data->recently_touched_keys.insert(e.key);
+ }
+ } while (rnd->Next() % 3 != 0); // do 3 mutations on average
+ }
+
+ void SkipForward() {
+ while (iter < (int)data->entries.size() && !data->entries[iter].visible) {
+ ++iter;
+ }
+ }
+ void SkipBackward() {
+ while (iter >= 0 && !data->entries[iter].visible) {
+ --iter;
+ }
+ }
+
+ void SeekToFirst() override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ iter = 0;
+ SkipForward();
+ }
+ void SeekToLast() override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ iter = (int)data->entries.size() - 1;
+ SkipBackward();
+ }
+
+ void Seek(const Slice& target) override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ // Binary search.
+ auto it = std::partition_point(
+ data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) { return cmp.Compare(e.ikey, target) < 0; });
+ iter = (int)(it - data->entries.begin());
+ SkipForward();
+ }
+ void SeekForPrev(const Slice& target) override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ // Binary search.
+ auto it = std::partition_point(
+ data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) { return cmp.Compare(e.ikey, target) <= 0; });
+ iter = (int)(it - data->entries.begin());
+ --iter;
+ SkipBackward();
+ }
+
+ void Next() override {
+ assert(Valid());
+ if (MaybeFail()) return;
+ MaybeMutate();
+ ++iter;
+ SkipForward();
+ }
+ void Prev() override {
+ assert(Valid());
+ if (MaybeFail()) return;
+ MaybeMutate();
+ --iter;
+ SkipBackward();
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return data->entries[iter].ikey;
+ }
+ Slice value() const override {
+ assert(Valid());
+ return data->entries[iter].value;
+ }
+
+ bool IsKeyPinned() const override { return true; }
+ bool IsValuePinned() const override { return true; }
+};
+
+// A small reimplementation of DBIter, supporting only some of the features,
+// and doing everything in O(log n).
+// Skips all keys that are in recently_touched_keys.
+struct ReferenceIterator {
+ Data* data;
+  uint64_t sequence;  // ignore entries with sequence number above this
+
+ bool valid = false;
+ std::string key;
+ std::string value;
+
+ ReferenceIterator(Data* _data, uint64_t _sequence)
+ : data(_data), sequence(_sequence) {}
+
+ bool Valid() const { return valid; }
+
+ // Finds the first entry with key
+ // greater/less/greater-or-equal/less-or-equal than `key`, depending on
+ // arguments: if `skip`, inequality is strict; if `forward`, it's
+ // greater/greater-or-equal, otherwise less/less-or-equal.
+ // Sets `key` to the result.
+ // If no such key exists, returns false. Doesn't check `visible`.
+ bool FindNextKey(bool skip, bool forward) {
+ valid = false;
+ auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) {
+ if (forward != skip) {
+ return e.key < key;
+ } else {
+ return e.key <= key;
+ }
+ });
+ if (forward) {
+ if (it != data->entries.end()) {
+ key = it->key;
+ return true;
+ }
+ } else {
+ if (it != data->entries.begin()) {
+ --it;
+ key = it->key;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool FindValueForCurrentKey() {
+ if (data->recently_touched_keys.count(key)) {
+ return false;
+ }
+
+ // Find the first entry for the key. The caller promises that it exists.
+ auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) {
+ if (e.key != key) {
+ return e.key < key;
+ }
+ return e.sequence > sequence;
+ });
+
+ // Find the first visible entry.
+ for (;; ++it) {
+ if (it == data->entries.end()) {
+ return false;
+ }
+ Entry& e = *it;
+ if (e.key != key) {
+ return false;
+ }
+ assert(e.sequence <= sequence);
+ if (!e.visible) continue;
+ if (e.type == kTypeDeletion) {
+ return false;
+ }
+ if (e.type == kTypeValue) {
+ value = e.value;
+ valid = true;
+ return true;
+ }
+ assert(e.type == kTypeMerge);
+ break;
+ }
+
+ // Collect merge operands.
+ std::vector<Slice> operands;
+ for (; it != data->entries.end(); ++it) {
+ Entry& e = *it;
+ if (e.key != key) {
+ break;
+ }
+ assert(e.sequence <= sequence);
+ if (!e.visible) continue;
+ if (e.type == kTypeDeletion) {
+ break;
+ }
+ operands.push_back(e.value);
+ if (e.type == kTypeValue) {
+ break;
+ }
+ }
+
+ // Do a merge.
+ value = operands.back().ToString();
+ for (int i = (int)operands.size() - 2; i >= 0; --i) {
+ value.append(",");
+ value.append(operands[i].data(), operands[i].size());
+ }
+
+ valid = true;
+ return true;
+ }
+
+ // Start at `key` and move until we encounter a valid value.
+ // `forward` defines the direction of movement.
+ // If `skip` is true, we're looking for key not equal to `key`.
+ void DoTheThing(bool skip, bool forward) {
+ while (FindNextKey(skip, forward) && !FindValueForCurrentKey()) {
+ skip = true;
+ }
+ }
+
+ void Seek(const Slice& target) {
+ key = target.ToString();
+ DoTheThing(false, true);
+ }
+ void SeekForPrev(const Slice& target) {
+ key = target.ToString();
+ DoTheThing(false, false);
+ }
+ void SeekToFirst() { Seek(""); }
+ void SeekToLast() {
+ key = data->entries.back().key;
+ DoTheThing(false, false);
+ }
+ void Next() {
+ assert(Valid());
+ DoTheThing(true, true);
+ }
+ void Prev() {
+ assert(Valid());
+ DoTheThing(true, false);
+ }
+};
+
+} // namespace
+
+// Use an internal iterator that sometimes returns errors and sometimes
+// adds/removes entries on the fly. Do random operations on a DBIter and
+// check results.
+// TODO: can be improved for more coverage:
+// * Override IsKeyPinned() and IsValuePinned() to actually use
+//   PinnedIteratorsManager and check that there's no use-after-free.
+// * Try different combinations of prefix_extractor, total_order_seek,
+// prefix_same_as_start, iterate_lower_bound, iterate_upper_bound.
+TEST_F(DBIteratorStressTest, StressTest) {
+ // We use a deterministic RNG, and everything happens in a single thread.
+ Random64 rnd(826909345792864532ll);
+
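+  // Generates a random key in [0, max_key), zero-padded to the decimal width
+  // of max_key so that lexicographic order matches numeric order.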
+ auto gen_key = [&](int max_key) {
+ assert(max_key > 0);
+ int len = 0;
+ int a = max_key;
+ while (a) {
+ a /= 10;
+ ++len;
+ }
+ std::string s = ToString(rnd.Next() % static_cast<uint64_t>(max_key));
+ s.insert(0, len - (int)s.size(), '0');
+ return s;
+ };
+
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ReadOptions ropt;
+
+ size_t num_matching = 0;
+ size_t num_at_end = 0;
+ size_t num_not_ok = 0;
+ size_t num_recently_removed = 0;
+
+ // Number of iterations for each combination of parameters
+ // (there are ~250 of those).
+ // Tweak this to change the test run time.
+  // As of the time of writing, the test takes ~4 seconds for a value of 5000.
+ const int num_iterations = 5000;
+ // Enable this to print all the operations for debugging.
+ bool trace = FLAGS_verbose;
+
+ for (int num_entries : {5, 10, 100}) {
+ for (double key_space : {0.1, 1.0, 3.0}) {
+ for (ValueType prevalent_entry_type :
+ {kTypeValue, kTypeDeletion, kTypeMerge}) {
+ for (double error_probability : {0.01, 0.1}) {
+ for (double mutation_probability : {0.01, 0.5}) {
+ for (double target_hidden_fraction : {0.1, 0.5}) {
+ std::string trace_str =
+ "entries: " + ToString(num_entries) +
+ ", key_space: " + ToString(key_space) +
+ ", error_probability: " + ToString(error_probability) +
+ ", mutation_probability: " + ToString(mutation_probability) +
+ ", target_hidden_fraction: " +
+ ToString(target_hidden_fraction);
+ SCOPED_TRACE(trace_str);
+ if (trace) {
+ std::cout << trace_str << std::endl;
+ }
+
+ // Generate data.
+ Data data;
+ int max_key = (int)(num_entries * key_space) + 1;
+ for (int i = 0; i < num_entries; ++i) {
+ Entry e;
+ e.key = gen_key(max_key);
+ if (rnd.Next() % 10 != 0) {
+ e.type = prevalent_entry_type;
+ } else {
+ const ValueType types[] = {kTypeValue, kTypeDeletion,
+ kTypeMerge};
+ e.type =
+ types[rnd.Next() % (sizeof(types) / sizeof(types[0]))];
+ }
+ e.sequence = i;
+ e.value = "v" + ToString(i);
+ ParsedInternalKey internal_key(e.key, e.sequence, e.type);
+ AppendInternalKey(&e.ikey, internal_key);
+
+ data.entries.push_back(e);
+ }
+ std::sort(data.entries.begin(), data.entries.end());
+ if (trace) {
+ std::cout << "entries:";
+ for (size_t i = 0; i < data.entries.size(); ++i) {
+ Entry& e = data.entries[i];
+ std::cout
+ << "\n idx " << i << ": \"" << e.key << "\": \""
+ << e.value << "\" seq: " << e.sequence << " type: "
+ << (e.type == kTypeValue
+ ? "val"
+ : e.type == kTypeDeletion ? "del" : "merge");
+ }
+ std::cout << std::endl;
+ }
+
+ std::unique_ptr<Iterator> db_iter;
+ std::unique_ptr<ReferenceIterator> ref_iter;
+ for (int iteration = 0; iteration < num_iterations; ++iteration) {
+ SCOPED_TRACE(iteration);
+ // Create a new iterator every ~30 operations.
+ if (db_iter == nullptr || rnd.Next() % 30 == 0) {
+ uint64_t sequence = rnd.Next() % (data.entries.size() + 2);
+ ref_iter.reset(new ReferenceIterator(&data, sequence));
+ if (trace) {
+ std::cout << "new iterator, seq: " << sequence << std::endl;
+ }
+
+ auto internal_iter =
+ new StressTestIterator(&data, &rnd, BytewiseComparator());
+ internal_iter->error_probability = error_probability;
+ internal_iter->mutation_probability = mutation_probability;
+ internal_iter->target_hidden_fraction =
+ target_hidden_fraction;
+ internal_iter->trace = trace;
+ db_iter.reset(NewDBIterator(
+ env_, ropt, ImmutableCFOptions(options),
+ MutableCFOptions(options), BytewiseComparator(),
+ internal_iter, sequence,
+ options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ }
+
+            // Do a random operation. It's important to do it on ref_iter
+            // after doing it on db_iter, to make sure ref_iter sees the
+            // correct recently_touched_keys.
+ std::string old_key;
+ bool forward = rnd.Next() % 2 > 0;
+ // Do Next()/Prev() ~90% of the time.
+ bool seek = !ref_iter->Valid() || rnd.Next() % 10 == 0;
+ if (trace) {
+ std::cout << iteration << ": ";
+ }
+
+ if (!seek) {
+ assert(db_iter->Valid());
+ old_key = ref_iter->key;
+ if (trace) {
+ std::cout << (forward ? "Next" : "Prev") << std::endl;
+ }
+
+ if (forward) {
+ db_iter->Next();
+ ref_iter->Next();
+ } else {
+ db_iter->Prev();
+ ref_iter->Prev();
+ }
+ } else {
+ data.recently_touched_keys.clear();
+ // Do SeekToFirst less often than Seek.
+ if (rnd.Next() % 4 == 0) {
+ if (trace) {
+ std::cout << (forward ? "SeekToFirst" : "SeekToLast")
+ << std::endl;
+ }
+
+ if (forward) {
+ old_key = "";
+ db_iter->SeekToFirst();
+ ref_iter->SeekToFirst();
+ } else {
+ old_key = data.entries.back().key;
+ db_iter->SeekToLast();
+ ref_iter->SeekToLast();
+ }
+ } else {
+ old_key = gen_key(max_key);
+ if (trace) {
+ std::cout << (forward ? "Seek" : "SeekForPrev") << " \""
+ << old_key << '"' << std::endl;
+ }
+ if (forward) {
+ db_iter->Seek(old_key);
+ ref_iter->Seek(old_key);
+ } else {
+ db_iter->SeekForPrev(old_key);
+ ref_iter->SeekForPrev(old_key);
+ }
+ }
+ }
+
+ // Check the result.
+ if (db_iter->Valid()) {
+ ASSERT_TRUE(db_iter->status().ok());
+ if (data.recently_touched_keys.count(
+ db_iter->key().ToString())) {
+ // Ended on a key that may have been mutated during the
+ // operation. Reference iterator skips such keys, so we
+ // can't check the exact result.
+
+ // Check that the key moved in the right direction.
+ if (forward) {
+ if (seek)
+ ASSERT_GE(db_iter->key().ToString(), old_key);
+ else
+ ASSERT_GT(db_iter->key().ToString(), old_key);
+ } else {
+ if (seek)
+ ASSERT_LE(db_iter->key().ToString(), old_key);
+ else
+ ASSERT_LT(db_iter->key().ToString(), old_key);
+ }
+
+ if (ref_iter->Valid()) {
+ // Check that DBIter didn't miss any non-mutated key.
+ if (forward) {
+ ASSERT_LT(db_iter->key().ToString(), ref_iter->key);
+ } else {
+ ASSERT_GT(db_iter->key().ToString(), ref_iter->key);
+ }
+ }
+ // Tell the next iteration of the loop to reseek the
+ // iterators.
+ ref_iter->valid = false;
+
+ ++num_recently_removed;
+ } else {
+ ASSERT_TRUE(ref_iter->Valid());
+ ASSERT_EQ(ref_iter->key, db_iter->key().ToString());
+ ASSERT_EQ(ref_iter->value, db_iter->value());
+ ++num_matching;
+ }
+ } else if (db_iter->status().ok()) {
+ ASSERT_FALSE(ref_iter->Valid());
+ ++num_at_end;
+ } else {
+ // Non-ok status. Nothing to check here.
+ // Tell the next iteration of the loop to reseek the
+ // iterators.
+ ref_iter->valid = false;
+ ++num_not_ok;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Check that all cases were hit many times.
+ EXPECT_GT(num_matching, 10000);
+ EXPECT_GT(num_at_end, 10000);
+ EXPECT_GT(num_not_ok, 10000);
+ EXPECT_GT(num_recently_removed, 10000);
+
+ std::cout << "stats:\n exact matches: " << num_matching
+ << "\n end reached: " << num_at_end
+ << "\n non-ok status: " << num_not_ok
+ << "\n mutated on the fly: " << num_recently_removed << std::endl;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc
new file mode 100644
index 000000000..ddbea8d17
--- /dev/null
+++ b/src/rocksdb/db/db_iter_test.cc
@@ -0,0 +1,3175 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <utility>
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static uint64_t TestGetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+}
+
+class TestIterator : public InternalIterator {
+ public:
+ explicit TestIterator(const Comparator* comparator)
+ : initialized_(false),
+ valid_(false),
+ sequence_number_(0),
+ iter_(0),
+ cmp(comparator) {
+ data_.reserve(16);
+ }
+
+ void AddPut(std::string argkey, std::string argvalue) {
+ Add(argkey, kTypeValue, argvalue);
+ }
+
+ void AddDeletion(std::string argkey) {
+ Add(argkey, kTypeDeletion, std::string());
+ }
+
+ void AddSingleDeletion(std::string argkey) {
+ Add(argkey, kTypeSingleDeletion, std::string());
+ }
+
+ void AddMerge(std::string argkey, std::string argvalue) {
+ Add(argkey, kTypeMerge, argvalue);
+ }
+
+ void Add(std::string argkey, ValueType type, std::string argvalue) {
+ Add(argkey, type, argvalue, sequence_number_++);
+ }
+
+ void Add(std::string argkey, ValueType type, std::string argvalue,
+ size_t seq_num, bool update_iter = false) {
+ valid_ = true;
+ ParsedInternalKey internal_key(argkey, seq_num, type);
+ data_.push_back(
+ std::pair<std::string, std::string>(std::string(), argvalue));
+ AppendInternalKey(&data_.back().first, internal_key);
+ if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) {
+ // insert a key smaller than current key
+ Finish();
+      // data_[iter_] is no longer the current element of the iterator.
+      // Increment iter_ so that it points at the current element again.
+ iter_++;
+ }
+ }
+
+  // Should be called before any operations on the iterator.
+ void Finish() {
+ initialized_ = true;
+ std::sort(data_.begin(), data_.end(),
+ [this](std::pair<std::string, std::string> a,
+ std::pair<std::string, std::string> b) {
+ return (cmp.Compare(a.first, b.first) < 0);
+ });
+ }
+
+ // Removes the key from the set of keys over which this iterator iterates.
+ // Not to be confused with AddDeletion().
+ // If the iterator is currently positioned on this key, the deletion will
+ // apply next time the iterator moves.
+ // Used for simulating ForwardIterator updating to a new version that doesn't
+ // have some of the keys (e.g. after compaction with a filter).
+ void Vanish(std::string _key) {
+ if (valid_ && data_[iter_].first == _key) {
+ delete_current_ = true;
+ return;
+ }
+ for (auto it = data_.begin(); it != data_.end(); ++it) {
+ ParsedInternalKey ikey;
+ bool ok __attribute__((__unused__)) = ParseInternalKey(it->first, &ikey);
+ assert(ok);
+ if (ikey.user_key != _key) {
+ continue;
+ }
+ if (valid_ && data_.begin() + iter_ > it) {
+ --iter_;
+ }
+ data_.erase(it);
+ return;
+ }
+ assert(false);
+ }
+
+ // Number of operations done on this iterator since construction.
+ size_t steps() const { return steps_; }
+
+ bool Valid() const override {
+ assert(initialized_);
+ return valid_;
+ }
+
+ void SeekToFirst() override {
+ assert(initialized_);
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ valid_ = (data_.size() > 0);
+ iter_ = 0;
+ }
+
+ void SeekToLast() override {
+ assert(initialized_);
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ valid_ = (data_.size() > 0);
+ iter_ = data_.size() - 1;
+ }
+
+ void Seek(const Slice& target) override {
+ assert(initialized_);
+ SeekToFirst();
+ ++steps_;
+ if (!valid_) {
+ return;
+ }
+ while (iter_ < data_.size() &&
+ (cmp.Compare(data_[iter_].first, target) < 0)) {
+ ++iter_;
+ }
+
+ if (iter_ == data_.size()) {
+ valid_ = false;
+ }
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ assert(initialized_);
+ DeleteCurrentIfNeeded();
+ SeekForPrevImpl(target, &cmp);
+ }
+
+ void Next() override {
+ assert(initialized_);
+ assert(valid_);
+ assert(iter_ < data_.size());
+
+ ++steps_;
+ if (delete_current_) {
+ DeleteCurrentIfNeeded();
+ } else {
+ ++iter_;
+ }
+ valid_ = iter_ < data_.size();
+ }
+
+ void Prev() override {
+ assert(initialized_);
+ assert(valid_);
+ assert(iter_ < data_.size());
+
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ if (iter_ == 0) {
+ valid_ = false;
+ } else {
+ --iter_;
+ }
+ }
+
+ Slice key() const override {
+ assert(initialized_);
+ return data_[iter_].first;
+ }
+
+ Slice value() const override {
+ assert(initialized_);
+ return data_[iter_].second;
+ }
+
+ Status status() const override {
+ assert(initialized_);
+ return Status::OK();
+ }
+
+ bool IsKeyPinned() const override { return true; }
+ bool IsValuePinned() const override { return true; }
+
+ private:
+ bool initialized_;
+ bool valid_;
+ size_t sequence_number_;
+ size_t iter_;
+ size_t steps_ = 0;
+
+ InternalKeyComparator cmp;
+ std::vector<std::pair<std::string, std::string>> data_;
+ bool delete_current_ = false;
+
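+  // Applies a deferred Vanish() of the entry the iterator was positioned on,
+  // if one is pending.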
+ void DeleteCurrentIfNeeded() {
+ if (!delete_current_) {
+ return;
+ }
+ data_.erase(data_.begin() + iter_);
+ delete_current_ = false;
+ }
+};
+
+class DBIteratorTest : public testing::Test {
+ public:
+ Env* env_;
+
+ DBIteratorTest() : env_(Env::Default()) {}
+};
+
+TEST_F(DBIteratorTest, DBIteratorPrevNext) {
+ Options options;
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound not set
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+
+ // Test to check the SeekToLast() with iterate_upper_bound set
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->AddPut("f", "val_f");
+ internal_iter->Finish();
+
+ Slice prefix("d");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+  // Test to check SeekToLast() with iterate_upper_bound set to a key that
+  // has not been Put yet
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->Finish();
+
+ Slice prefix("z");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set to the
+ // first key
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ Slice prefix("a");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ // Test case to check SeekToLast with iterate_upper_bound set
+  // (same key put many times - SeekToLast should start with the
+ // maximum sequence id of the upper bound)
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 7, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ SetPerfLevel(kEnableCount);
+ ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+ get_perf_context()->Reset();
+ db_iter->SeekToLast();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_key_skipped_count), 1);
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+
+ SetPerfLevel(kDisable);
+ }
+ // Test to check the SeekToLast() with the iterate_upper_bound set
+ // (Checking the value of the key which has sequence ids greater than
+  // and less than the iterator's sequence id)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a1");
+ internal_iter->AddPut("a", "val_a2");
+ internal_iter->AddPut("b", "val_b1");
+ internal_iter->AddPut("c", "val_c1");
+ internal_iter->AddPut("c", "val_c2");
+ internal_iter->AddPut("c", "val_c3");
+ internal_iter->AddPut("b", "val_b2");
+ internal_iter->AddPut("d", "val_d1");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 4, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b1");
+ }
+
+ // Test to check the SeekToLast() with the iterate_upper_bound set to the
+ // key that is deleted
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("a");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ // Test to check the SeekToLast() with the iterate_upper_bound set
+ // (Deletion cases)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set
+  // (Deletion cases - Lots of internal keys after the upper_bound
+  // are deleted)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("e");
+ internal_iter->AddDeletion("f");
+ internal_iter->AddDeletion("g");
+ internal_iter->AddDeletion("h");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 7, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ SetPerfLevel(kEnableCount);
+ ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+ get_perf_context()->Reset();
+ db_iter->SeekToLast();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+
+ SetPerfLevel(kDisable);
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorEmpty) {
+ Options options;
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ ReadOptions ro;
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
+ ReadOptions ro;
+ Options options;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddPut("a", "a");
+ internal_iter->AddPut("b", "b");
+ internal_iter->AddPut("c", "c");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 2,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "c");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "b");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "a");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkip) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("c", ToString(k));
+ }
+ internal_iter->Finish();
+
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), ToString(i));
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ }
+
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 202, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ }
+
+ {
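+ // With no other keys present, snapshots at sequence i never reach the final
+ // put of "c", so the iterator is empty in both directions; the follow-up
+ // case at sequence 200 does see the put and returns exactly one entry.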
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 200, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
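+ // Every put of "c" is newer than the snapshot (i + 2), so SeekToLast()
+ // returns the newest visible put of "d" (value ToString(i)) and reverse
+ // iteration then skips "c" entirely, landing on the merges for "b" and "a".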
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("d", ToString(k));
+ }
+
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("c", ToString(k));
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), ToString(i));
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ }
+
+ {
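+ // The stringappend merge operator folds the visible merge operands of "c"
+ // (0 through i) into one comma-separated value; the loop below rebuilds the
+ // expected string for comparison.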
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "b");
+ internal_iter->AddMerge("a", "a");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddMerge("c", ToString(k));
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ std::string merge_result = "0";
+ for (size_t j = 1; j <= i; ++j) {
+ merge_result += "," + ToString(j);
+ }
+ ASSERT_EQ(db_iter->value().ToString(), merge_result);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
+ Options options;
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ ReadOptions ro;
+
+ // Basic test case: make sure explicitly passing the default value works.
+ // Skipping internal keys is disabled by default, i.e. when the value is 0.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 0;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+ }
+
+ // Test to make sure that the request will *not* fail as incomplete if
+ // num_internal_keys_skipped is *equal* to max_skippable_internal_keys
+ // threshold. (It will fail as incomplete only when the threshold is
+ // exceeded.)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+ }
+
+ // Fail the request as incomplete when num_internal_keys_skipped >
+ // max_skippable_internal_keys
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the num_internal_keys_skipped counter resets after a successful
+ // read.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Next(); // num_internal_keys_skipped counter resets here.
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the num_internal_keys_skipped counter resets after a successful
+ // read.
+ // Reverse direction
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev(); // num_internal_keys_skipped counter resets here.
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the limit is also enforced when the skipped internal keys
+ // belong to separate user keys.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test if alternating puts and deletes of the same key are handled correctly.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test for a large number of skippable internal keys with the *default*
+ // max_sequential_skip_in_iterations.
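+ // Which limit trips first decides the outcome: while the per-read threshold
+ // (i) stays within max_sequential_skip_in_iterations + 1, the internal
+ // entries for "b" are skipped one by one and the counter overflows, failing
+ // the read as Incomplete; once i exceeds that bound, the iterator reseeks
+ // past the hidden versions before the counter reaches i and the read
+ // succeeds.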
+ {
+ for (size_t i = 1; i <= 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ for (size_t j = 1; j <= i; ++j) {
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ }
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = i;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ if ((options.max_sequential_skip_in_iterations + 1) >=
+ ro.max_skippable_internal_keys) {
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+ }
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ if ((options.max_sequential_skip_in_iterations + 1) >=
+ ro.max_skippable_internal_keys) {
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+ }
+ }
+ }
+
+ // Test for a large number of skippable internal keys with a *non-default*
+ // max_sequential_skip_in_iterations.
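+ // With the sequential-skip limit raised to 1000, the reseek never kicks in
+ // before the internal-key threshold (i <= 200) is exceeded, so every read
+ // ends as Incomplete in both directions.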
+ {
+ for (size_t i = 1; i <= 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ for (size_t j = 1; j <= i; ++j) {
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ }
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ options.max_sequential_skip_in_iterations = 1000;
+ ro.max_skippable_internal_keys = i;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator1) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 1,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator2) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 0,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator3) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 2,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator4) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 4,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0,1");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator5) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
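+ // Each block below reads key "a" at an increasing snapshot sequence (0
+ // through 6); the expected value shows how merge operands are folded until
+ // a visible put resets the base value. A final block exercises a put
+ // followed by SingleDelete and further merges.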
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 1, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 3, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 4, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 5, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 6, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ // put, singledelete, merge
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddSingleDeletion("a");
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 10, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator6) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
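+ // Same layout as DBIterator5, but with a deletion in place of the put: at
+ // snapshot sequence 3 the deletion is the newest visible entry and the key
+ // disappears; above that, the later merges rebuild the value from scratch.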
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 1, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 3, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 4, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 5, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 6, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator7) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
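+ // The blocks below replay the same mix of puts, merges and deletions on
+ // keys "a", "b" and "c" at snapshot sequences 0, 2, 4, 5, 6, 7, 9, 13 and
+ // 14, checking that reverse iteration folds merge operands correctly around
+ // each deletion.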
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 0, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 2, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 4, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 5, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+ db_iter->Prev();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 6, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 7, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 9, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 13, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(),
+ "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+ internal_iter, 14, options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(),
+ "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator8) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+}
+
+ // TODO(3.13): fix the issue of Seek() then Prev(), which might not
+ // necessarily return the largest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator9) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("b", "merge_3");
+ internal_iter->AddMerge("b", "merge_4");
+ internal_iter->AddMerge("d", "merge_5");
+ internal_iter->AddMerge("d", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+
+ db_iter->SeekForPrev("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+ db_iter->Seek("c");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+
+ db_iter->SeekForPrev("c");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+ }
+}
+
+ // TODO(3.13): fix the issue of Seek() then Prev(), which might not
+ // necessarily return the largest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator10) {
+ ReadOptions ro;
+ Options options;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("c", "3");
+ internal_iter->AddPut("d", "4");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+ db_iter->Seek("c");
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+
+ db_iter->SeekForPrev("c");
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "4");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+}
+
+TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10, 0 /* force seek */,
+ nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "1");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator11) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddSingleDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 1,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator12) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("c", "3");
+ internal_iter->AddSingleDeletion("b");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10, 0, nullptr /*read_callback*/));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "1");
+ db_iter->Prev();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator13) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ std::string key;
+ key.resize(9);
+ key.assign(9, static_cast<char>(0));
+ key[0] = 'b';
+
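+ // The key is 'b' followed by eight zero bytes; with only sequence 2 visible
+ // and a sequential-skip limit of 3, Seek("b") must reseek past the newer
+ // versions of this key and still return value "2".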
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut(key, "0");
+ internal_iter->AddPut(key, "1");
+ internal_iter->AddPut(key, "2");
+ internal_iter->AddPut(key, "3");
+ internal_iter->AddPut(key, "4");
+ internal_iter->AddPut(key, "5");
+ internal_iter->AddPut(key, "6");
+ internal_iter->AddPut(key, "7");
+ internal_iter->AddPut(key, "8");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 2, 3, nullptr /*read_callback*/));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), key);
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+}
+
+TEST_F(DBIteratorTest, DBIterator14) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ std::string key("b");
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddPut("b", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("b", "3");
+ internal_iter->AddPut("a", "4");
+ internal_iter->AddPut("a", "5");
+ internal_iter->AddPut("a", "6");
+ internal_iter->AddPut("c", "7");
+ internal_iter->AddPut("c", "8");
+ internal_iter->AddPut("c", "9");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 4, 1, nullptr /*read_callback*/));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+ db_iter->SeekToFirst();
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "4");
+}
+
+TEST_F(DBIteratorTest, DBIteratorTestDifferentialSnapshots) {
+ { // Test that KVs written earlier than iter_start_seqnum are filtered out
+ ReadOptions ro;
+ ro.iter_start_seqnum=5;
+ Options options;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
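+ // Each key gets three versions; with iter_start_seqnum = 5 and the iterator
+ // built at sequence 13, only keys whose newest visible version has a
+ // sequence number >= 5 come back, and they are returned as full internal
+ // keys (ParseFullKey below recovers the type and sequence).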
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 10; ++i) {
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "a");
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "b");
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "c");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 13,
+ options.max_sequential_skip_in_iterations, nullptr));
+ // Expecting internal keys with sequence numbers 5, 8, 11 and 13 and the
+ // correct types
+    int seqnums[4] = {5, 8, 11, 13};
+    std::string user_keys[4] = {"1", "2", "3", "4"};
+    std::string values[4] = {"1c", "2c", "3c", "4b"};
+ int i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ FullKey fkey;
+ ParseFullKey(db_iter->key(), &fkey);
+ ASSERT_EQ(user_keys[i], fkey.user_key.ToString());
+ ASSERT_EQ(EntryType::kEntryPut, fkey.type);
+ ASSERT_EQ(seqnums[i], fkey.sequence);
+ ASSERT_EQ(values[i], db_iter->value().ToString());
+ i++;
+ }
+ ASSERT_EQ(i, 4);
+ }
+
+ { // Test that deletes are returned correctly as internal KVs
+ ReadOptions ro;
+ ro.iter_start_seqnum=5;
+ Options options;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 10; ++i) {
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "a");
+ internal_iter->AddPut(std::to_string(i), std::to_string(i) + "b");
+ internal_iter->AddDeletion(std::to_string(i));
+ }
+ internal_iter->Finish();
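+    // For user keys "1".."3" the newest version inside [5, 13] is a deletion,
+    // which should be surfaced as an internal delete entry; for "4" it is the
+    // put "4b" at sequence 13.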
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 13,
+ options.max_sequential_skip_in_iterations, nullptr));
+    // Expecting internal keys with seqnums in [5,13] and the correct type
+    int seqnums[4] = {5, 8, 11, 13};
+    EntryType key_types[4] = {EntryType::kEntryDelete, EntryType::kEntryDelete,
+                              EntryType::kEntryDelete, EntryType::kEntryPut};
+    std::string user_keys[4] = {"1", "2", "3", "4"};
+    std::string values[4] = {"", "", "", "4b"};
+ int i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ FullKey fkey;
+ ParseFullKey(db_iter->key(), &fkey);
+ ASSERT_EQ(user_keys[i], fkey.user_key.ToString());
+ ASSERT_EQ(key_types[i], fkey.type);
+ ASSERT_EQ(seqnums[i], fkey.sequence);
+ ASSERT_EQ(values[i], db_iter->value().ToString());
+ i++;
+ }
+ ASSERT_EQ(i, 4);
+ }
+}
+
+class DBIterWithMergeIterTest : public testing::Test {
+ public:
+ DBIterWithMergeIterTest()
+ : env_(Env::Default()), icomp_(BytewiseComparator()) {
+ options_.merge_operator = nullptr;
+
+ internal_iter1_ = new TestIterator(BytewiseComparator());
+ internal_iter1_->Add("a", kTypeValue, "1", 3u);
+ internal_iter1_->Add("f", kTypeValue, "2", 5u);
+ internal_iter1_->Add("g", kTypeValue, "3", 7u);
+ internal_iter1_->Finish();
+
+ internal_iter2_ = new TestIterator(BytewiseComparator());
+ internal_iter2_->Add("a", kTypeValue, "4", 6u);
+ internal_iter2_->Add("b", kTypeValue, "5", 1u);
+ internal_iter2_->Add("c", kTypeValue, "6", 2u);
+ internal_iter2_->Add("d", kTypeValue, "7", 3u);
+ internal_iter2_->Finish();
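+    // Merged view visible at sequence 8: a->"4" (seq 6 shadows seq 3),
+    // b->"5", c->"6", d->"7", f->"2", g->"3".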
+
+ std::vector<InternalIterator*> child_iters;
+ child_iters.push_back(internal_iter1_);
+ child_iters.push_back(internal_iter2_);
+ InternalKeyComparator icomp(BytewiseComparator());
+ InternalIterator* merge_iter =
+ NewMergingIterator(&icomp_, &child_iters[0], 2u);
+
+ db_iter_.reset(NewDBIterator(
+ env_, ro_, ImmutableCFOptions(options_), MutableCFOptions(options_),
+ BytewiseComparator(), merge_iter,
+        8 /* read data with sequence number <= 8 */,
+        3 /* max sequential skips before a reseek */, nullptr /*read_callback*/));
+ }
+
+ Env* env_;
+ ReadOptions ro_;
+ Options options_;
+ TestIterator* internal_iter1_;
+ TestIterator* internal_iter2_;
+ InternalKeyComparator icomp_;
+ Iterator* merge_iter_;
+ std::unique_ptr<Iterator> db_iter_;
+};
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) {
+ db_iter_->SeekToFirst();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+ db_iter_->Next();
+ ASSERT_FALSE(db_iter_->Valid());
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
+ // Test Prev() when one child iterator is at its end.
+ db_iter_->SeekForPrev("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts a key at the end of the mem table after
+  // MergeIterator::Prev() has realized the mem table iterator is at its end
+  // and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev",
+ [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries that update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table iterator
+  // is at its end and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+ internal_iter2_->Add("z", kTypeValue, "7", 12u);
+ internal_iter2_->Add("z", kTypeValue, "7", 11u);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added and max_skipped is triggered.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries that update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table iterator
+  // is at its end and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+ internal_iter2_->Add("z", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 11u, true);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) {
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) {
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) {
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts an entry that updates a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) {
+ internal_iter1_->Add("u", kTypeValue, "10", 4u);
+ internal_iter1_->Add("v", kTypeValue, "11", 4u);
+ internal_iter1_->Add("w", kTypeValue, "12", 4u);
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
+ // internal_iter1_: a, f, g
+ // internal_iter2_: a, b, c, d, adding (z)
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts two keys before "z" into the mem table after
+  // MergeIterator::Prev() calls the mem table iterator's Seek() and before
+  // calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("y", kTypeValue, "7", 17u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+
+TEST_F(DBIteratorTest, SeekPrefixTombstones) {
+ ReadOptions ro;
+ Options options;
+ options.prefix_extractor.reset(NewNoopTransform());
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("e");
+ internal_iter->AddDeletion("f");
+ internal_iter->AddDeletion("g");
+ internal_iter->Finish();
+
+ ro.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
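+  // With the no-op prefix transform each whole key is its own prefix, so with
+  // prefix_same_as_start a seek whose target prefix matches none of the
+  // tombstoned keys should become invalid without skipping internal entries.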
+ int skipped_keys = 0;
+
+ get_perf_context()->Reset();
+ db_iter->SeekForPrev("z");
+ skipped_keys =
+ static_cast<int>(get_perf_context()->internal_key_skipped_count);
+ ASSERT_EQ(skipped_keys, 0);
+
+ get_perf_context()->Reset();
+ db_iter->Seek("a");
+ skipped_keys =
+ static_cast<int>(get_perf_context()->internal_key_skipped_count);
+ ASSERT_EQ(skipped_keys, 0);
+}
+
+TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
+ const int kNumKeys = 3;
+ for (int i = 0; i < kNumKeys + 2; ++i) {
+ // + 2 for two special cases: lower bound before and lower bound after the
+ // internal iterator's keys
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(i);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ if (i == kNumKeys + 1) {
+ // lower bound was beyond the last key
+ ASSERT_FALSE(db_iter->Valid());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ int expected;
+ if (i == 0) {
+ // lower bound was before the first key
+ expected = 1;
+ } else {
+ // lower bound was at the ith key
+ expected = i;
+ }
+ ASSERT_EQ(std::to_string(expected), db_iter->key().ToString());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, PrevLowerBound) {
+ const int kNumKeys = 3;
+ const int kLowerBound = 2;
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(kLowerBound);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations, nullptr /* read_callback */));
+
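+  // Reverse iteration should stop as soon as it would move below
+  // iterate_lower_bound.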
+ db_iter->SeekToLast();
+ for (int i = kNumKeys; i >= kLowerBound; --i) {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(std::to_string(i), db_iter->key().ToString());
+ db_iter->Prev();
+ }
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, SeekLessLowerBound) {
+ const int kNumKeys = 3;
+ const int kLowerBound = 2;
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(kLowerBound);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations, nullptr /* read_callback */));
+
+  auto before_lower_bound_str = std::to_string(kLowerBound - 1);
+  Slice before_lower_bound(before_lower_bound_str);
+
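+  // A Seek target below iterate_lower_bound should be clamped up to the bound.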
+ db_iter->Seek(before_lower_bound);
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(lower_bound_str, db_iter->key().ToString());
+}
+
+TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
+ Options options;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(0));
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "A");
+ internal_iter->AddPut("b", "B");
+ for (int i = 0; i < 100; ++i) {
+ internal_iter->AddPut("c" + ToString(i), "");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ReadOptions(), ImmutableCFOptions(options),
+ MutableCFOptions(options), BytewiseComparator(), internal_iter, 10,
+ options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+ db_iter->SeekForPrev("a");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ("a", db_iter->key().ToString());
+
+ internal_iter->Vanish("a");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ("b", db_iter->key().ToString());
+
+ // A (sort of) bug used to cause DBIter to pointlessly drag the internal
+ // iterator all the way to the end. But this doesn't really matter at the time
+ // of writing because the only iterator that can see disappearing keys is
+ // ForwardIterator, which doesn't support SeekForPrev().
+ EXPECT_LT(internal_iter->steps(), 20);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iterator_test.cc b/src/rocksdb/db/db_iterator_test.cc
new file mode 100644
index 000000000..99ffb5ce4
--- /dev/null
+++ b/src/rocksdb/db/db_iterator_test.cc
@@ -0,0 +1,2998 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/flush_block_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A dummy ReadCallback that reports every key as committed.
+class DummyReadCallback : public ReadCallback {
+ public:
+ DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {}
+ bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; }
+ void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; }
+};
+
+// Test param:
+// bool: whether to pass read_callback to NewIterator().
+class DBIteratorTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBIteratorTest() : DBTestBase("/db_iterator_test") {}
+
+ Iterator* NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family = nullptr) {
+ if (column_family == nullptr) {
+ column_family = db_->DefaultColumnFamily();
+ }
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ SequenceNumber seq = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : db_->GetLatestSequenceNumber();
+ bool use_read_callback = GetParam();
+ DummyReadCallback* read_callback = nullptr;
+ if (use_read_callback) {
+ read_callback = new DummyReadCallback();
+ read_callback->SetSnapshot(seq);
+ InstrumentedMutexLock lock(&mutex_);
+ read_callbacks_.push_back(
+ std::unique_ptr<DummyReadCallback>(read_callback));
+ }
+ return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback);
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_;
+};
+
+TEST_P(DBIteratorTest, IteratorProperty) {
+ // The test needs to be changed if kPersistedTier is supported in iterator.
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put(1, "1", "2");
+ Delete(1, "2");
+ ReadOptions ropt;
+ ropt.pin_data = false;
+ {
+ std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1]));
+ iter->SeekToFirst();
+ std::string prop_value;
+ ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("0", prop_value);
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ iter->Next();
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("Iterator is not valid.", prop_value);
+
+ // Get internal key at which the iteration stopped (tombstone in this case).
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("2", prop_value);
+ }
+ Close();
+}
+
+TEST_P(DBIteratorTest, PersistedTierOnIterator) {
+ // The test needs to be changed if kPersistedTier is supported in iterator.
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ReadOptions ropt;
+ ropt.read_tier = kPersistedTier;
+
+ auto* iter = db_->NewIterator(ropt, handles_[1]);
+ ASSERT_TRUE(iter->status().IsNotSupported());
+ delete iter;
+
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
+ Close();
+}
+
+TEST_P(DBIteratorTest, NonBlockingIteration) {
+ do {
+ ReadOptions non_blocking_opts, regular_opts;
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ non_blocking_opts.read_tier = kBlockCacheTier;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // write one kv to the database.
+ ASSERT_OK(Put(1, "a", "b"));
+
+ // scan using non-blocking iterator. We should find it because
+ // it is in memtable.
+ Iterator* iter = NewIterator(non_blocking_opts, handles_[1]);
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ delete iter;
+
+    // Flush the memtable to storage. Now the key should be neither in the
+    // memtable nor in the block cache.
+ ASSERT_OK(Flush(1));
+
+ // verify that a non-blocking iterator does not find any
+ // kvs. Neither does it do any IOs to storage.
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ iter = NewIterator(non_blocking_opts, handles_[1]);
+ count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ count++;
+ }
+ ASSERT_EQ(count, 0);
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ delete iter;
+
+ // read in the specified block via a regular get
+ ASSERT_EQ(Get(1, "a"), "b");
+
+ // verify that we can find it via a non-blocking scan
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ iter = NewIterator(non_blocking_opts, handles_[1]);
+ count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ delete iter;
+
+    // This test verifies block cache behavior, which the plain table format
+    // does not use.
+ } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads));
+}
+
+TEST_P(DBIteratorTest, IterSeekBeforePrev) {
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("0", "f"));
+ ASSERT_OK(Put("1", "h"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("2", "j"));
+ auto iter = NewIterator(ReadOptions());
+ iter->Seek(Slice("c"));
+ iter->Prev();
+ iter->Seek(Slice("a"));
+ iter->Prev();
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterReseekNewUpperBound) {
+ Random rnd(301);
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ table_options.block_size_deviation = 50;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ Reopen(options);
+
+ ASSERT_OK(Put("a", RandomString(&rnd, 400)));
+ ASSERT_OK(Put("aabb", RandomString(&rnd, 400)));
+ ASSERT_OK(Put("aaef", RandomString(&rnd, 400)));
+ ASSERT_OK(Put("b", RandomString(&rnd, 400)));
+ dbfull()->Flush(FlushOptions());
+ ReadOptions opts;
+ Slice ub = Slice("aa");
+ opts.iterate_upper_bound = &ub;
+ auto iter = NewIterator(opts);
+ iter->Seek(Slice("a"));
+ ub = Slice("b");
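+  // ReadOptions keeps a pointer to the bound Slice, so reassigning `ub` should
+  // raise the effective upper bound for the next Seek, making "aaef" reachable.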
+ iter->Seek(Slice("aabc"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aaef");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) {
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("0", "f"));
+ ASSERT_OK(Put("1", "h"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("2", "j"));
+ auto iter = NewIterator(ReadOptions());
+ iter->SeekForPrev(Slice("0"));
+ iter->Next();
+ iter->SeekForPrev(Slice("1"));
+ iter->Next();
+ delete iter;
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+ return std::string(length, c);
+}
+} // namespace
+
+TEST_P(DBIteratorTest, IterLongKeys) {
+ ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
+ ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
+ ASSERT_OK(Put("a", "b"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
+ ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
+ ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
+ auto iter = NewIterator(ReadOptions());
+
+  // Forward iteration should return the long keys in sorted order.
+ iter->Seek(MakeLongKey(20, 0));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
+
+ iter->SeekForPrev(MakeLongKey(127, 3));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ delete iter;
+
+ iter = NewIterator(ReadOptions());
+ iter->Seek(MakeLongKey(50, 1));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterNextWithNewerSeq) {
+ ASSERT_OK(Put("0", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("d", "e"));
+ auto iter = NewIterator(ReadOptions());
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Seek(Slice("a"));
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->SeekForPrev(Slice("b"));
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq) {
+ ASSERT_OK(Put("0", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("d", "e"));
+ auto iter = NewIterator(ReadOptions());
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Seek(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "d->e");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ iter->SeekForPrev(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "d->e");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) {
+ ASSERT_OK(Put("0", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("e", "f"));
+ auto iter = NewIterator(ReadOptions());
+ auto iter2 = NewIterator(ReadOptions());
+ iter->Seek(Slice("c"));
+ iter2->SeekForPrev(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ ASSERT_EQ(IterStatus(iter2), "c->d");
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ iter2->Prev();
+ ASSERT_EQ(IterStatus(iter2), "a->b");
+ iter2->Prev();
+ delete iter;
+ delete iter2;
+}
+
+TEST_P(DBIteratorTest, IterEmpty) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekForPrev("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterSingle) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMulti) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ ASSERT_OK(Put(1, "b", "vb"));
+ ASSERT_OK(Put(1, "c", "vc"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("ax");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->SeekForPrev("d");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("c");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("bx");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Seek("z");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->SeekForPrev("");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ // Switch from reverse to forward
+ iter->SeekToLast();
+ iter->Prev();
+ iter->Prev();
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Switch from forward to reverse
+ iter->SeekToFirst();
+ iter->Next();
+ iter->Next();
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Make sure iter stays at snapshot
+ ASSERT_OK(Put(1, "a", "va2"));
+ ASSERT_OK(Put(1, "a2", "va3"));
+ ASSERT_OK(Put(1, "b", "vb2"));
+ ASSERT_OK(Put(1, "c", "vc2"));
+ ASSERT_OK(Delete(1, "b"));
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST_P(DBIteratorTest, IterReseek) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ options.max_sequential_skip_in_iterations = 3;
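+  // The iterator falls back to a reseek roughly once it has had to skip more
+  // than max_sequential_skip_in_iterations (3 here) obsolete internal entries
+  // in a row.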
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Insert three versions of the same user key and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+ ASSERT_OK(Put(1, "a", "zero"));
+ ASSERT_OK(Put(1, "a", "one"));
+ ASSERT_OK(Put(1, "a", "two"));
+ ASSERT_OK(Put(1, "b", "bone"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "a->two");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+  // Insert a fourth version of the same user key and verify
+  // that reseek is still not invoked.
+ ASSERT_OK(Put(1, "a", "three"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->three");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+  // Insert a fifth version of the same user key and verify
+  // that reseek is now invoked.
+ ASSERT_OK(Put(1, "a", "four"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+  // Testing the reverse iterator.
+  // At this point, we have five versions of "a" and one version of "b".
+  // The reseek statistic is already at 1.
+ int num_reseeks = static_cast<int>(
+ TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  // Insert another version of "b". SeekToLast() should not need a reseek,
+  // but the subsequent Prev() (which skips the versions of "a") should.
+ ASSERT_OK(Put(1, "b", "btwo"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->btwo");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 1);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+
+  // Insert two more versions of "b". This makes a total of 4 versions
+  // of "b" and 5 versions of "a".
+ ASSERT_OK(Put(1, "b", "bthree"));
+ ASSERT_OK(Put(1, "b", "bfour"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->bfour");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 2);
+ iter->Prev();
+
+ // the previous Prev call should have invoked reseek
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 3);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterSmallAndLargeMix) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
+ ASSERT_OK(Put(1, "c", "vc"));
+ ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
+ ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
+
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMultiWithDelete) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "ka", "va"));
+ ASSERT_OK(Put(1, "kb", "vb"));
+ ASSERT_OK(Put(1, "kc", "vc"));
+ ASSERT_OK(Delete(1, "kb"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
+
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->Seek("kc");
+ ASSERT_EQ(IterStatus(iter), "kc->vc");
+ if (!CurrentOptions().merge_operator) {
+ // TODO: merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ &&
+ kHashSkipList != option_config_) { // doesn't support SeekToLast
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "ka->va");
+ }
+ }
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IterPrevMaxSkip) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put(1, "key1", "v1"));
+ ASSERT_OK(Put(1, "key2", "v2"));
+ ASSERT_OK(Put(1, "key3", "v3"));
+ ASSERT_OK(Put(1, "key4", "v4"));
+ ASSERT_OK(Put(1, "key5", "v5"));
+ }
+
+ VerifyIterLast("key5->v5", 1);
+
+ ASSERT_OK(Delete(1, "key5"));
+ VerifyIterLast("key4->v4", 1);
+
+ ASSERT_OK(Delete(1, "key4"));
+ VerifyIterLast("key3->v3", 1);
+
+ ASSERT_OK(Delete(1, "key3"));
+ VerifyIterLast("key2->v2", 1);
+
+ ASSERT_OK(Delete(1, "key2"));
+ VerifyIterLast("key1->v1", 1);
+
+ ASSERT_OK(Delete(1, "key1"));
+ VerifyIterLast("(invalid)", 1);
+ } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
+}
+
+TEST_P(DBIteratorTest, IterWithSnapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ ASSERT_OK(Put(1, "key1", "val1"));
+ ASSERT_OK(Put(1, "key2", "val2"));
+ ASSERT_OK(Put(1, "key3", "val3"));
+ ASSERT_OK(Put(1, "key4", "val4"));
+ ASSERT_OK(Put(1, "key5", "val5"));
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions options;
+ options.snapshot = snapshot;
+ Iterator* iter = NewIterator(options, handles_[1]);
+
+ ASSERT_OK(Put(1, "key0", "val0"));
+ // Put more values after the snapshot
+ ASSERT_OK(Put(1, "key100", "val100"));
+ ASSERT_OK(Put(1, "key101", "val101"));
+
+ iter->Seek("key5");
+ ASSERT_EQ(IterStatus(iter), "key5->val5");
+ if (!CurrentOptions().merge_operator) {
+ // TODO: merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key4->val4");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key3->val3");
+
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key4->val4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key5->val5");
+ }
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ if (!CurrentOptions().merge_operator) {
+ // TODO(gzh): merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+ iter->SeekForPrev("key1");
+ ASSERT_EQ(IterStatus(iter), "key1->val1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key2->val2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key3->val3");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key2->val2");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key1->val1");
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorPinsRef) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Put(1, "foo", "hello");
+
+ // Get iterator that will yield the current contents of the DB.
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ // Write to force compactions
+ Put(1, "foo", "newvalue1");
+ for (int i = 0; i < 100; i++) {
+      // ~100KB values
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+ }
+ Put(1, "foo", "newvalue2");
+
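+    // The iterator pins the version it was created on, so it should still
+    // return the original "hello" value even though "foo" has been
+    // overwritten since.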
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("hello", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ Put(1, "foo", "delete-cf-then-delete-iter");
+ Put(1, "hello", "value2");
+
+ ColumnFamilyHandle* cf = handles_[1];
+ ReadOptions ro;
+
+ auto* iter = db_->NewIterator(ro, cf);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter");
+
+ // delete CF handle
+ db_->DestroyColumnFamilyHandle(cf);
+ handles_.erase(std::begin(handles_) + 1);
+
+ // delete Iterator after CF handle is deleted
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "hello->value2");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ Put(1, "foo", "drop-cf-then-delete-iter");
+
+ ReadOptions ro;
+ ColumnFamilyHandle* cf = handles_[1];
+
+ auto* iter = db_->NewIterator(ro, cf);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter");
+
+ // drop and delete CF
+ db_->DropColumnFamily(cf);
+ db_->DestroyColumnFamilyHandle(cf);
+ handles_.erase(std::begin(handles_) + 1);
+
+ // delete Iterator after CF handle is dropped
+ delete iter;
+}
+
+// SetOptions not defined in ROCKSDB LITE
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, DBIteratorBoundTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("g1", "0"));
+
+ // testing basic case with no iterate_upper_bound and no prefix_extractor
+ {
+ ReadOptions ro;
+ ro.iterate_upper_bound = nullptr;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+ iter->SeekForPrev("g1");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+ }
+
+ // testing iterate_upper_bound and forward iterator
+ // to make sure it stops at bound
+ {
+ ReadOptions ro;
+ // iterate_upper_bound points beyond the last expected entry
+ Slice prefix("foo2");
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(("foo1")), 0);
+
+ iter->Next();
+ // should stop here...
+ ASSERT_TRUE(!iter->Valid());
+ }
+ // Testing SeekToLast with iterate_upper_bound set
+ {
+ ReadOptions ro;
+
+ Slice prefix("foo");
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("a")), 0);
+ }
+
+ // prefix is the first letter of the key
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("g1", "0"));
+
+ // testing with iterate_upper_bound and prefix_extractor
+  // Seek target and iterate_upper_bound are not in the same prefix.
+  // Strictly speaking this is a misuse, but iteration still stops at the bound.
+ {
+ ReadOptions ro;
+ Slice upper_bound("g");
+ ro.iterate_upper_bound = &upper_bound;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo1", iter->key().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+  // Testing that iterate_upper_bound prevents iterating over deleted items
+  // once the bound has been reached.
+ {
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("b", "0"));
+ ASSERT_OK(Put("b1", "0"));
+ ASSERT_OK(Put("c", "0"));
+ ASSERT_OK(Put("d", "0"));
+ ASSERT_OK(Put("e", "0"));
+ ASSERT_OK(Delete("c"));
+ ASSERT_OK(Delete("d"));
+
+ // base case with no bound
+ ReadOptions ro;
+ ro.iterate_upper_bound = nullptr;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+ get_perf_context()->Reset();
+ iter->Next();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_delete_skipped_count), 2);
+
+    // now testing with iterate_upper_bound
+ Slice prefix("c");
+ ro.iterate_upper_bound = &prefix;
+
+ iter.reset(NewIterator(ro));
+
+ get_perf_context()->Reset();
+
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+ iter->Next();
+ // the iteration should stop as soon as the bound key is reached
+ // even though the key is deleted
+ // hence internal_delete_skipped_count should be 0
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+ }
+}
+
+TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("z", "0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+
+ {
+ std::string up_str = "foo5";
+ Slice up(up_str);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &up;
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ uint64_t prev_block_cache_hit =
+ TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ uint64_t prev_block_cache_miss =
+ TestGetTickerCount(options, BLOCK_CACHE_MISS);
+
+ ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0);
+
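+    // The seeks below stay inside the upper bound and should be answered from
+    // blocks that are already loaded, so the block cache hit/miss counters are
+    // expected to stay unchanged.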
+ iter->Seek("foo4");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo4")), 0);
+ ASSERT_EQ(prev_block_cache_hit,
+ TestGetTickerCount(options, BLOCK_CACHE_HIT));
+ ASSERT_EQ(prev_block_cache_miss,
+ TestGetTickerCount(options, BLOCK_CACHE_MISS));
+
+ iter->Seek("foo2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo3")), 0);
+ ASSERT_EQ(prev_block_cache_hit,
+ TestGetTickerCount(options, BLOCK_CACHE_HIT));
+ ASSERT_EQ(prev_block_cache_miss,
+ TestGetTickerCount(options, BLOCK_CACHE_MISS));
+ }
+}
+#endif
+
+TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) {
+ for (auto format_version : {2, 3, 4}) {
+ int upper_bound_hits = 0;
+ Options options = CurrentOptions();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableIterator:out_of_bound",
+ [&upper_bound_hits](void*) { upper_bound_hits++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ BlockBasedTableOptions table_options;
+ table_options.format_version = format_version;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Flush());
+
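+    // With one key per block, the upper bound "foo3" lies between "foo2" and
+    // "foo4", so the out_of_bound sync point should fire exactly once, when
+    // the iterator advances past the bound.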
+ Slice ub("foo3");
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+ ASSERT_EQ(upper_bound_hits, 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+ ASSERT_EQ(upper_bound_hits, 0);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(upper_bound_hits, 1);
+ }
+}
+
+// Enable kBinarySearchWithFirstKey, do some iterator operations and check that
+// they don't do unnecessary block reads.
+TEST_P(DBIteratorTest, IndexWithFirstKey) {
+ for (int tailing = 0; tailing < 2; ++tailing) {
+ SCOPED_TRACE("tailing = " + std::to_string(tailing));
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ BlockBasedTableOptions table_options;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ table_options.block_cache =
+ NewLRUCache(8000); // fits all blocks and their cache metadata overhead
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("a1", "x1"));
+ ASSERT_OK(Merge("b1", "y1"));
+ ASSERT_OK(Merge("c0", "z1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("a2", "x2"));
+ ASSERT_OK(Merge("b2", "y2"));
+ ASSERT_OK(Merge("c0", "z2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("a3", "x3"));
+ ASSERT_OK(Merge("b3", "y3"));
+ ASSERT_OK(Merge("c3", "z3"));
+ ASSERT_OK(Flush());
+
+ // Block cache is not important for this test.
+ // We use BLOCK_CACHE_DATA_* counters just because they're the most readily
+ // available way of counting block accesses.
+
+ ReadOptions ropt;
+ ropt.tailing = tailing;
+ std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
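+    // With kBinarySearchWithFirstKey the index already knows each block's
+    // first key, so this Seek should load only the single data block that
+    // contains the result ("b2"), as the miss counter below verifies.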
+ iter->Seek("b10");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b2", iter->key().ToString());
+ EXPECT_EQ("y2", iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b3", iter->key().ToString());
+ EXPECT_EQ("y3", iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ iter->Seek("c0");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("c0", iter->key().ToString());
+ EXPECT_EQ("z1,z2", iter->value().ToString());
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("c3", iter->key().ToString());
+ EXPECT_EQ("z3", iter->value().ToString());
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter.reset();
+
+ // Enable iterate_upper_bound and check that iterator is not trying to read
+ // blocks that are fully above upper bound.
+ std::string ub = "b3";
+ Slice ub_slice(ub);
+ ropt.iterate_upper_bound = &ub_slice;
+ iter.reset(NewIterator(ropt));
+
+ iter->Seek("b2");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b2", iter->key().ToString());
+ EXPECT_EQ("y2", iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ }
+}
+
+TEST_P(DBIteratorTest, IndexWithFirstKeyGet) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ BlockBasedTableOptions table_options;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ table_options.block_cache = NewLRUCache(1000); // fits all blocks
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("a", "x1"));
+ ASSERT_OK(Merge("c", "y1"));
+ ASSERT_OK(Merge("e", "z1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("c", "y2"));
+ ASSERT_OK(Merge("e", "z2"));
+ ASSERT_OK(Flush());
+
+ // Get() between blocks shouldn't read any blocks.
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Get() of an existing key shouldn't read any unnecessary blocks when there's
+ // only one key per block.
+
+ ASSERT_EQ("y1,y2", Get("c"));
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ ASSERT_EQ("x1", Get("a"));
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ EXPECT_EQ(std::vector<std::string>({"NOT_FOUND", "z1,z2"}),
+ MultiGet({"b", "e"}));
+}
+
+// TODO(3.13): fix the issue of Seek() + Prev() which might not necessarily
+// return the largest key which is smaller than the seek key.
+TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.env = env_;
+ DestroyAndReopen(options);
+
+ // write three entries with different keys using Merge()
+ WriteOptions wopts;
+ db_->Merge(wopts, "1", "data1");
+ db_->Merge(wopts, "2", "data2");
+ db_->Merge(wopts, "3", "data3");
+
+ std::unique_ptr<Iterator> it(NewIterator(ReadOptions()));
+
+ it->Seek("2");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("2", it->key().ToString());
+
+ it->Prev();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("1", it->key().ToString());
+
+ it->SeekForPrev("1");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("1", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("2", it->key().ToString());
+}
+
+class DBIteratorTestForPinnedData : public DBIteratorTest {
+ public:
+ enum TestConfig {
+ NORMAL,
+ CLOSE_AND_OPEN,
+ COMPACT_BEFORE_READ,
+ FLUSH_EVERY_1000,
+ MAX
+ };
+ DBIteratorTestForPinnedData() : DBIteratorTest() {}
+ void PinnedDataIteratorRandomized(TestConfig run_config) {
+ // Generate Random data
+ Random rnd(301);
+
+ int puts = 100000;
+ int key_pool = static_cast<int>(puts * 0.7);
+ int key_size = 100;
+ int val_size = 1000;
+ int seeks_percentage = 20; // 20% of keys will be used to test seek()
+ int delete_percentage = 20; // 20% of keys will be deleted
+ int merge_percentage = 20; // 20% of keys will be added using Merge()
+
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ DestroyAndReopen(options);
+
+ std::vector<std::string> generated_keys(key_pool);
+ for (int i = 0; i < key_pool; i++) {
+ generated_keys[i] = RandomString(&rnd, key_size);
+ }
+
+ std::map<std::string, std::string> true_data;
+ std::vector<std::string> random_keys;
+ std::vector<std::string> deleted_keys;
+ for (int i = 0; i < puts; i++) {
+ auto& k = generated_keys[rnd.Next() % key_pool];
+ auto v = RandomString(&rnd, val_size);
+
+ // Insert data to true_data map and to DB
+ true_data[k] = v;
+ if (rnd.PercentTrue(merge_percentage)) {
+ ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+ } else {
+ ASSERT_OK(Put(k, v));
+ }
+
+ // Pick random keys to be used to test Seek()
+ if (rnd.PercentTrue(seeks_percentage)) {
+ random_keys.push_back(k);
+ }
+
+ // Delete some random keys
+ if (rnd.PercentTrue(delete_percentage)) {
+ deleted_keys.push_back(k);
+ true_data.erase(k);
+ ASSERT_OK(Delete(k));
+ }
+
+ if (run_config == TestConfig::FLUSH_EVERY_1000) {
+ if (i && i % 1000 == 0) {
+ Flush();
+ }
+ }
+ }
+
+ if (run_config == TestConfig::CLOSE_AND_OPEN) {
+ Close();
+ Reopen(options);
+ } else if (run_config == TestConfig::COMPACT_BEFORE_READ) {
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+
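+    // pin_data=true keeps the key() Slices valid for the lifetime of the
+    // iterator, so keys can be collected as Slices during iteration and
+    // verified against true_data afterwards.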
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ {
+ // Test Seek to random keys
+ std::vector<Slice> keys_slices;
+ std::vector<std::string> true_keys;
+ for (auto& k : random_keys) {
+ iter->Seek(k);
+ if (!iter->Valid()) {
+ ASSERT_EQ(true_data.lower_bound(k), true_data.end());
+ continue;
+ }
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ keys_slices.push_back(iter->key());
+ true_keys.push_back(true_data.lower_bound(k)->first);
+ }
+
+ for (size_t i = 0; i < keys_slices.size(); i++) {
+ ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+ }
+ }
+
+ {
+ // Test SeekForPrev to random keys
+ std::vector<Slice> keys_slices;
+ std::vector<std::string> true_keys;
+ for (auto& k : random_keys) {
+ iter->SeekForPrev(k);
+ if (!iter->Valid()) {
+ ASSERT_EQ(true_data.upper_bound(k), true_data.begin());
+ continue;
+ }
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ keys_slices.push_back(iter->key());
+ true_keys.push_back((--true_data.upper_bound(k))->first);
+ }
+
+ for (size_t i = 0; i < keys_slices.size(); i++) {
+ ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+ }
+ }
+
+ {
+ // Test iterating all data forward
+ std::vector<Slice> all_keys;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ all_keys.push_back(iter->key());
+ }
+ ASSERT_EQ(all_keys.size(), true_data.size());
+
+ // Verify that all keys slices are valid
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < all_keys.size(); i++) {
+ ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+ data_iter++;
+ }
+ }
+
+ {
+ // Test iterating all data backward
+ std::vector<Slice> all_keys;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ all_keys.push_back(iter->key());
+ }
+ ASSERT_EQ(all_keys.size(), true_data.size());
+
+ // Verify that all keys slices are valid (backward)
+ auto data_iter = true_data.rbegin();
+ for (size_t i = 0; i < all_keys.size(); i++) {
+ ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+ data_iter++;
+ }
+ }
+
+ delete iter;
+  }
+};
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) {
+ PinnedDataIteratorRandomized(TestConfig::NORMAL);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCLoseAndOpen) {
+ PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN);
+}
+
+TEST_P(DBIteratorTestForPinnedData,
+ PinnedDataIteratorRandomizedCompactBeforeRead) {
+ PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) {
+ PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb
+ DestroyAndReopen(options);
+
+ std::map<std::string, std::string> true_data;
+
+ // Generate 4 sst files in L2
+ Random rnd(301);
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i * 3);
+ std::string v = RandomString(&rnd, 100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ if (i % 250 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(FilesPerLevel(0), "0,4");
+
+ // Generate 4 sst files in L0
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i * 2);
+ std::string v = RandomString(&rnd, 100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ if (i % 250 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+ // Add some keys/values in memtables
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i);
+ std::string v = RandomString(&rnd, 100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ ASSERT_EQ(results.size(), true_data.size());
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < results.size(); i++, data_iter++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, data_iter->first);
+ ASSERT_EQ(kv.second, data_iter->second);
+ }
+
+ delete iter;
+}
+#endif
+
+TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ DestroyAndReopen(options);
+
+ std::string numbers[7];
+ for (int val = 0; val <= 6; val++) {
+ PutFixed64(numbers + val, val);
+ }
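+  // numbers[i] holds the fixed-width 64-bit encoding of i; these are both the
+  // merge operands written below and the expected merged results.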
+
+ // +1 all keys in range [ 0 => 999]
+ for (int i = 0; i < 1000; i++) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[1]));
+ }
+
+ // +2 all keys divisible by 2 in range [ 0 => 999]
+ for (int i = 0; i < 1000; i += 2) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[2]));
+ }
+
+ // +3 all keys divisible by 5 in range [ 0 => 999]
+ for (int i = 0; i < 1000; i += 5) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[3]));
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ ASSERT_EQ(results.size(), 1000);
+ for (size_t i = 0; i < results.size(); i++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, Key(static_cast<int>(i)));
+ int expected_val = 1;
+ if (i % 2 == 0) {
+ expected_val += 2;
+ }
+ if (i % 5 == 0) {
+ expected_val += 3;
+ }
+ ASSERT_EQ(kv.second, numbers[expected_val]);
+ }
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.write_buffer_size = 100000;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < 1000; i++) {
+ std::string k = RandomString(&rnd, 10);
+ std::string v = RandomString(&rnd, 1000);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ // Delete 50% of the keys and update the other 50%
+ for (auto& kv : true_data) {
+ if (rnd.OneIn(2)) {
+ ASSERT_OK(Delete(kv.first));
+ } else {
+ std::string new_val = RandomString(&rnd, 1000);
+ ASSERT_OK(Put(kv.first, new_val));
+ }
+ }
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < results.size(); i++, data_iter++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, data_iter->first);
+ ASSERT_EQ(kv.second, data_iter->second);
+ }
+
+ delete iter;
+}
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+ const char* Name() const override {
+ return "SliceTransformLimitedDomainGeneric";
+ }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 1);
+ }
+
+ bool InDomain(const Slice& src) const override {
+    // any non-empty key is in the domain; its first byte is the prefix
+ return src.size() >= 1;
+ }
+
+ bool InRange(const Slice& dst) const override {
+    // a valid prefix is exactly one byte long
+ return dst.size() == 1;
+ }
+};
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", "va1"));
+ ASSERT_OK(Put("a2", "va2"));
+ ASSERT_OK(Put("a3", "va3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b1", "vb1"));
+ ASSERT_OK(Put("b2", "vb2"));
+ ASSERT_OK(Put("b3", "vb3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b4", "vb4"));
+ ASSERT_OK(Put("d1", "vd1"));
+ ASSERT_OK(Put("d2", "vd2"));
+ ASSERT_OK(Put("d4", "vd4"));
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(1);
+ {
+ ReadOptions ro;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekForPrev("a4");
+ ASSERT_EQ(iter->key().ToString(), "a3");
+ ASSERT_EQ(iter->value().ToString(), "va3");
+
+ iter->SeekForPrev("c2");
+ ASSERT_EQ(iter->key().ToString(), "b3");
+ iter->SeekForPrev("d3");
+ ASSERT_EQ(iter->key().ToString(), "d2");
+ iter->SeekForPrev("b5");
+ ASSERT_EQ(iter->key().ToString(), "b4");
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.prefix_same_as_start = true;
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("c2");
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) {
+ Options options = CurrentOptions();
+ options.prefix_extractor =
+ std::make_shared<SliceTransformLimitedDomainGeneric>();
+ options.disable_auto_compactions = true;
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", "va1"));
+ ASSERT_OK(Put("a2", "va2"));
+ ASSERT_OK(Put("a3", "va3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b1", "vb1"));
+ ASSERT_OK(Put("b2", "vb2"));
+ ASSERT_OK(Put("b3", "vb3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b4", "vb4"));
+ ASSERT_OK(Put("d1", "vd1"));
+ ASSERT_OK(Put("d2", "vd2"));
+ ASSERT_OK(Put("d4", "vd4"));
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(1);
+ {
+ ReadOptions ro;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekForPrev("a4");
+ ASSERT_EQ(iter->key().ToString(), "a3");
+ ASSERT_EQ(iter->value().ToString(), "va3");
+
+ iter->SeekForPrev("c2");
+ ASSERT_EQ(iter->key().ToString(), "b3");
+ iter->SeekForPrev("d3");
+ ASSERT_EQ(iter->key().ToString(), "d2");
+ iter->SeekForPrev("b5");
+ ASSERT_EQ(iter->key().ToString(), "b4");
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.prefix_same_as_start = true;
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("c2");
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1; // every block will contain one entry
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.disable_auto_compactions = true;
+ options.max_sequential_skip_in_iterations = 8;
+
+ DestroyAndReopen(options);
+
+  // Putting such deletes will force DBIter::Prev() to fall back to a Seek
+ for (int file_num = 0; file_num < 10; file_num++) {
+ ASSERT_OK(Delete("key4"));
+ ASSERT_OK(Flush());
+ }
+
+ // First File containing 5 blocks of puts
+ ASSERT_OK(Put("key1", "val1.0"));
+ ASSERT_OK(Put("key2", "val2.0"));
+ ASSERT_OK(Put("key3", "val3.0"));
+ ASSERT_OK(Put("key4", "val4.0"));
+ ASSERT_OK(Put("key5", "val5.0"));
+ ASSERT_OK(Flush());
+
+ // Second file containing 9 blocks of merge operands
+ ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2"));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3"));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4"));
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekToLast();
+ ASSERT_EQ(iter->key().ToString(), "key5");
+ ASSERT_EQ(iter->value().ToString(), "val5.0");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key4");
+ ASSERT_EQ(iter->value().ToString(), "val4.0");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key3");
+ ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key2");
+ ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key1");
+ ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2");
+
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.disable_auto_compactions = true;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.max_sequential_skip_in_iterations = 8;
+ DestroyAndReopen(options);
+
+ const int kNumKeys = 500;
+  // Small number of merge operands to make sure that DBIter::Prev() doesn't
+  // fall back to Seek()
+ const int kNumMergeOperands = 3;
+  // Use a value size that will make sure that every block contains 1 key
+ const int kValSize =
+ static_cast<int>(BlockBasedTableOptions().block_size) * 4;
+  // Percentage of keys that won't get merge operations
+ const int kNoMergeOpPercentage = 20;
+ // Percentage of keys that will be deleted
+ const int kDeletePercentage = 10;
+
+ // For half of the key range we will write multiple deletes first to
+ // force DBIter::Prev() to fall back to Seek()
+ for (int file_num = 0; file_num < 10; file_num++) {
+ for (int i = 0; i < kNumKeys; i += 2) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ std::string gen_key;
+ std::string gen_val;
+
+ for (int i = 0; i < kNumKeys; i++) {
+ gen_key = Key(i);
+ gen_val = RandomString(&rnd, kValSize);
+
+ ASSERT_OK(Put(gen_key, gen_val));
+ true_data[gen_key] = gen_val;
+ }
+ ASSERT_OK(Flush());
+
+  // Keep values and merge operands in different files so that we
+  // make sure they are not merged during flush but are actually
+  // merged in the read path
+ for (int i = 0; i < kNumKeys; i++) {
+ if (rnd.PercentTrue(kNoMergeOpPercentage)) {
+      // Don't give merge operations for some keys
+ continue;
+ }
+
+ for (int j = 0; j < kNumMergeOperands; j++) {
+ gen_key = Key(i);
+ gen_val = RandomString(&rnd, kValSize);
+
+ ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
+ true_data[gen_key] += "," + gen_val;
+ }
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < kNumKeys; i++) {
+ if (rnd.PercentTrue(kDeletePercentage)) {
+ gen_key = Key(i);
+
+ ASSERT_OK(Delete(gen_key));
+ true_data.erase(gen_key);
+ }
+ }
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+ auto data_iter = true_data.rbegin();
+
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ data_iter++;
+ }
+ ASSERT_EQ(data_iter, true_data.rend());
+
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+ auto data_iter = true_data.rbegin();
+
+ int entries_right = 0;
+ std::string seek_key;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ // Verify key/value of current position
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+
+ bool restore_position_with_seek = rnd.Uniform(2);
+ if (restore_position_with_seek) {
+ seek_key = iter->key().ToString();
+ }
+
+      // Do some Next() operations, then restore the iterator to its original
+      // position
+ int next_count =
+ entries_right > 0 ? rnd.Uniform(std::min(entries_right, 10)) : 0;
+ for (int i = 0; i < next_count; i++) {
+ iter->Next();
+ data_iter--;
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+
+ if (restore_position_with_seek) {
+        // Restore original position using Seek()
+ iter->Seek(seek_key);
+ for (int i = 0; i < next_count; i++) {
+ data_iter++;
+ }
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ } else {
+ // Restore original position using Prev()
+ for (int i = 0; i < next_count; i++) {
+ iter->Prev();
+ data_iter++;
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+ }
+
+ entries_right++;
+ data_iter++;
+ }
+ ASSERT_EQ(data_iter, true_data.rend());
+
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IteratorWithLocalStatistics) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 1000; i++) {
+ // Key 10 bytes / Value 10 bytes
+ ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+ }
+
+ std::atomic<uint64_t> total_next(0);
+ std::atomic<uint64_t> total_next_found(0);
+ std::atomic<uint64_t> total_prev(0);
+ std::atomic<uint64_t> total_prev_found(0);
+ std::atomic<uint64_t> total_bytes(0);
+
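+  // Each reader thread tallies its own per-iterator counters; after all
+  // threads join, the totals must match the global tickers, i.e. the local
+  // iterator statistics must have been folded back into the DB statistics.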
+ std::vector<port::Thread> threads;
+ std::function<void()> reader_func_next = [&]() {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ Iterator* iter = NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ // Seek will bump ITER_BYTES_READ
+ uint64_t bytes = 0;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ while (true) {
+ iter->Next();
+ total_next++;
+
+ if (!iter->Valid()) {
+ break;
+ }
+ total_next_found++;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ }
+
+ delete iter;
+ ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+ SetPerfLevel(kDisable);
+ total_bytes += bytes;
+ };
+
+ std::function<void()> reader_func_prev = [&]() {
+ SetPerfLevel(kEnableCount);
+ Iterator* iter = NewIterator(ReadOptions());
+
+ iter->SeekToLast();
+ // Seek will bump ITER_BYTES_READ
+ uint64_t bytes = 0;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ while (true) {
+ iter->Prev();
+ total_prev++;
+
+ if (!iter->Valid()) {
+ break;
+ }
+ total_prev_found++;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ }
+
+ delete iter;
+ ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+ SetPerfLevel(kDisable);
+ total_bytes += bytes;
+ };
+
+ for (int i = 0; i < 10; i++) {
+ threads.emplace_back(reader_func_next);
+ }
+ for (int i = 0; i < 15; i++) {
+ threads.emplace_back(reader_func_prev);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), (uint64_t)total_next);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND),
+ (uint64_t)total_next_found);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND),
+ (uint64_t)total_prev_found);
+  ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ),
+            (uint64_t)total_bytes);
+}
+
+TEST_P(DBIteratorTest, ReadAhead) {
+ Options options;
+ env_->count_random_reads_ = true;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 4 << 20;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string value(1024, 'a');
+ for (int i = 0; i < 100; i++) {
+ Put(Key(i), value);
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 100; i++) {
+ Put(Key(i), value);
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 100; i++) {
+ Put(Key(i), value);
+ }
+ ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
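+  // Baseline: measure file opens and bytes read for a plain SeekToFirst(),
+  // then repeat with readahead_size set and compare.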
+ env_->random_read_bytes_counter_ = 0;
+ options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+ ReadOptions read_options;
+ auto* iter = NewIterator(read_options);
+ iter->SeekToFirst();
+ int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);
+ size_t bytes_read = env_->random_read_bytes_counter_;
+ delete iter;
+
+ int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES);
+ env_->random_read_bytes_counter_ = 0;
+ options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+ read_options.readahead_size = 1024 * 10;
+ iter = NewIterator(read_options);
+ iter->SeekToFirst();
+ int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS);
+ size_t bytes_read_readahead = env_->random_read_bytes_counter_;
+ delete iter;
+ int64_t num_file_closes_readahead =
+ TestGetTickerCount(options, NO_FILE_CLOSES);
+ ASSERT_EQ(num_file_opens, num_file_opens_readahead);
+ ASSERT_EQ(num_file_closes, num_file_closes_readahead);
+ ASSERT_GT(bytes_read_readahead, bytes_read);
+ ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
+
+ // Verify correctness.
+ iter = NewIterator(read_options);
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(value, iter->value());
+ count++;
+ }
+ ASSERT_EQ(100, count);
+ for (int i = 0; i < 100; i++) {
+ iter->Seek(Key(i));
+ ASSERT_EQ(value, iter->value());
+ }
+ delete iter;
+}
+
+// Insert a key, create a snapshot iterator, overwrite the key many times,
+// seek to a smaller key. Expect DBIter to fall back to a seek instead of
+// going through all the overwrites linearly.
+TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 3;
+ options.prefix_extractor = nullptr;
+ options.write_buffer_size = 1 << 27; // big enough to avoid flush
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // Insert.
+ ASSERT_OK(Put("b", "0"));
+
+ // Create iterator.
+ ReadOptions ro;
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ // Insert a lot.
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
+ }
+
+#ifndef ROCKSDB_LITE
+ // Check that memtable wasn't flushed.
+ std::string val;
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
+ EXPECT_EQ("0", val);
+#endif
+
+ // Seek iterator to a smaller key.
+ get_perf_context()->Reset();
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b", iter->key().ToString());
+ EXPECT_EQ("0", iter->value().ToString());
+
+ // Check that the seek didn't do too much work.
+ // Checks are not tight, just make sure that everything is well below 100.
+ EXPECT_LT(get_perf_context()->internal_key_skipped_count, 4);
+ EXPECT_LT(get_perf_context()->internal_recent_skipped_count, 8);
+ EXPECT_LT(get_perf_context()->seek_on_memtable_count, 10);
+ EXPECT_LT(get_perf_context()->next_on_memtable_count, 10);
+ EXPECT_LT(get_perf_context()->prev_on_memtable_count, 10);
+
+ // Check that iterator did something like what we expect.
+ EXPECT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+ EXPECT_EQ(get_perf_context()->internal_merge_count, 0);
+ EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2);
+ EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2);
+ EXPECT_EQ(1, options.statistics->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION));
+}
+
+TEST_P(DBIteratorTest, Refresh) {
+ ASSERT_OK(Put("x", "y"));
+
+ std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(Put("c", "d"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
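+  // Refresh() renews the iterator to the current state of the DB, so the "c"
+  // written after the iterator was created becomes visible below.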
+ iter->Refresh();
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ dbfull()->Flush(FlushOptions());
+
+ ASSERT_OK(Put("m", "n"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter->Refresh();
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("m")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter.reset();
+}
+
+TEST_P(DBIteratorTest, RefreshWithSnapshot) {
+ ASSERT_OK(Put("x", "y"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions options;
+ options.snapshot = snapshot;
+ Iterator* iter = NewIterator(options);
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(Put("c", "d"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ Status s;
+ s = iter->Refresh();
+ ASSERT_TRUE(s.IsNotSupported());
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, CreationFailure) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) {
+ *(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Iterator* iter = NewIterator(ReadOptions());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) {
+ Options options = CurrentOptions();
+ options.max_sequential_skip_in_iterations = 3;
+ DestroyAndReopen(options);
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("y", "1"));
+ ASSERT_OK(Put("y1", "1"));
+ ASSERT_OK(Put("y2", "1"));
+ ASSERT_OK(Put("y3", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Put("bar", "1"));
+ ASSERT_OK(Put("foo", "1"));
+
+ std::string upper_bound = "x";
+ Slice ub_slice(upper_bound);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub_slice;
+ ro.max_skippable_internal_keys = 1000;
+
+ Iterator* iter = NewIterator(ro);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bar", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, TableFilter) {
+ ASSERT_OK(Put("a", "1"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("b", "2"));
+ ASSERT_OK(Put("c", "3"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("d", "4"));
+ ASSERT_OK(Put("e", "5"));
+ ASSERT_OK(Put("f", "6"));
+ dbfull()->Flush(FlushOptions());
+
+ // Ensure the table_filter callback is called once for each table.
+ {
+ std::set<uint64_t> unseen{1, 2, 3};
+ ReadOptions opts;
+ opts.table_filter = [&](const TableProperties& props) {
+ auto it = unseen.find(props.num_entries);
+ if (it == unseen.end()) {
+ ADD_FAILURE() << "saw table properties with an unexpected "
+ << props.num_entries << " entries";
+ } else {
+ unseen.erase(it);
+ }
+ return true;
+ };
+ auto iter = NewIterator(opts);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->3");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->5");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "f->6");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(unseen.empty());
+ delete iter;
+ }
+
+ // Ensure returning false in the table_filter hides the keys from that table
+ // during iteration.
+ {
+ ReadOptions opts;
+ opts.table_filter = [](const TableProperties& props) {
+ return props.num_entries != 2;
+ };
+ auto iter = NewIterator(opts);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->5");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "f->6");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) {
+ Options options = CurrentOptions();
+ options.max_sequential_skip_in_iterations = 3;
+ DestroyAndReopen(options);
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("y", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Put("bar", "1"));
+ ASSERT_OK(Put("foo", "1"));
+ ASSERT_OK(Put("foo", "2"));
+
+ ASSERT_OK(Put("foo", "3"));
+ ASSERT_OK(Put("foo", "4"));
+ ASSERT_OK(Put("foo", "5"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put("foo", "6"));
+
+ std::string upper_bound = "x";
+ Slice ub_slice(upper_bound);
+ ReadOptions ro;
+ ro.snapshot = snapshot;
+ ro.iterate_upper_bound = &ub_slice;
+
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("goo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ iter->Prev();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBIteratorTest, SkipStatistics) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ int skip_count = 0;
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("b", "1"));
+ ASSERT_OK(Put("c", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("d", "1"));
+ ASSERT_OK(Put("e", "1"));
+ ASSERT_OK(Put("f", "1"));
+ ASSERT_OK(Put("a", "2"));
+ ASSERT_OK(Put("b", "2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("d"));
+ ASSERT_OK(Delete("e"));
+ ASSERT_OK(Delete("f"));
+
+ Iterator* iter = NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 3);
+ delete iter;
+ skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ iter = NewIterator(ReadOptions());
+ count = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 3);
+ delete iter;
+ skip_count += 8; // Same as above, but in reverse order
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ ASSERT_OK(Put("aa", "1"));
+ ASSERT_OK(Put("ab", "1"));
+ ASSERT_OK(Put("ac", "1"));
+ ASSERT_OK(Put("ad", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("ab"));
+ ASSERT_OK(Delete("ac"));
+ ASSERT_OK(Delete("ad"));
+
+ ReadOptions ro;
+ Slice prefix("b");
+ ro.iterate_upper_bound = &prefix;
+
+ iter = NewIterator(ro);
+ count = 0;
+  for (iter->Seek("aa"); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ delete iter;
+ skip_count += 6; // 3 deletes + 3 original keys
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ iter = NewIterator(ro);
+ count = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ // 3 deletes + 3 original keys + lower sequence of "a"
+ skip_count += 7;
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+}
+
+TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ReadOptions ropts;
+ ropts.max_skippable_internal_keys = 2;
+
+ Put("1", "val_1");
+ // Add more tombstones than max_skippable_internal_keys so that Next() fails.
+ Delete("2");
+ Delete("3");
+ Delete("4");
+ Delete("5");
+ Put("6", "val_6");
+
+ std::unique_ptr<Iterator> iter(NewIterator(ropts));
+ iter->SeekToFirst();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "1");
+ ASSERT_EQ(iter->value().ToString(), "val_1");
+
+ // This should fail as incomplete due to too many non-visible internal keys on
+ // the way to the next valid user key.
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_TRUE(iter->status().IsIncomplete());
+
+ // Get the internal key at which Next() failed.
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("4", prop_value);
+
+ // Create a new iterator to seek to the internal key.
+ std::unique_ptr<Iterator> iter2(NewIterator(ropts));
+ iter2->Seek(prop_value);
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_OK(iter2->status());
+
+ ASSERT_EQ(iter2->key().ToString(), "6");
+ ASSERT_EQ(iter2->value().ToString(), "val_6");
+}
+
+// Reproduces a former bug where the iterator would skip some records when
+// DBIter re-seeks a subiterator with Incomplete status.
+TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ // Make sure the sst file has more than one block.
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Two records in sst file, each in its own block.
+ Put("b", "");
+ Put("d", "");
+ Flush();
+
+ // Create a nonblocking iterator before writing to memtable.
+ ReadOptions ropt;
+ ropt.read_tier = kBlockCacheTier;
+ std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+ // Overwrite a key in memtable many times to hit
+ // max_sequential_skip_in_iterations (which is 8 by default).
+ for (int i = 0; i < 20; ++i) {
+ Put("c", "");
+ }
+
+ // Load the second block in sst file into the block cache.
+ {
+ std::unique_ptr<Iterator> iter2(NewIterator(ReadOptions()));
+ iter2->Seek("d");
+ }
+
+ // Finally seek the nonblocking iterator.
+ iter->Seek("a");
+ // With the bug, the status used to be OK, and the iterator used to point to
+ // "d".
+ EXPECT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) {
+ Put("a", "");
+ Put("b", "");
+ Flush();
+
+ ReadOptions ropt;
+ Slice ub = "b";
+ ropt.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt));
+ it->SeekForPrev("a");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("a", it->key().ToString());
+ it->Next();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ it->SeekForPrev("a");
+ ASSERT_OK(it->status());
+
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("a", it->key().ToString());
+}
+
+TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 800;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
+ std::string random_str = RandomString(&rnd, 180);
+
+ ASSERT_OK(Put("1", random_str));
+ ASSERT_OK(Put("2", random_str));
+ ASSERT_OK(Put("3", random_str));
+ ASSERT_OK(Put("4", random_str));
+ // A new block
+ ASSERT_OK(Put("5", random_str));
+ ASSERT_OK(Put("6", random_str));
+ ASSERT_OK(Put("7", random_str));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("8", random_str));
+ ASSERT_OK(Put("9", random_str));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ int num_find_file_in_level = 0;
+ int num_idx_blk_seek = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelIterator::Seek:BeforeFindFile",
+ [&](void* /*arg*/) { num_find_file_in_level++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
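+  // Forward seeks that stay within the same file (and the same data block)
+  // should not repeat the LevelIterator file lookup or the index block seek;
+  // the two sync-point counters below track exactly that.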
+ {
+ std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+ iter->Seek("1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("3");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(2, num_idx_blk_seek);
+
+ iter->Seek("6");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(2, num_idx_blk_seek);
+
+ iter->Seek("7");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(3, num_idx_blk_seek);
+
+ iter->Seek("8");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(2, num_find_file_in_level);
+ // Still re-seek because "8" is the boundary key, which has
+ // the same user key as the seek key.
+ ASSERT_EQ(4, num_idx_blk_seek);
+
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(5, num_idx_blk_seek);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(5, num_idx_blk_seek);
+
+    // Seeking backward never allows the index block seek to be skipped
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(6, num_idx_blk_seek);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// MyRocks may change iterate bounds before seek. Simply test to make sure such
+// usage doesn't break the iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::string value(50, 'v');
+ Reopen(options);
+ ASSERT_OK(Put("aaa", value));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("eee", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::string ub1 = "e";
+ std::string ub2 = "c";
+ Slice ub(ub1);
+ ReadOptions read_opts1;
+ read_opts1.iterate_upper_bound = &ub;
+ Iterator* iter = NewIterator(read_opts1);
+  // Seek and iterate across the block boundary.
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ ub = Slice(ub2);
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ std::string lb1 = "a";
+ std::string lb2 = "c";
+ Slice lb(lb1);
+ ReadOptions read_opts2;
+ read_opts2.iterate_lower_bound = &lb;
+ iter = NewIterator(read_opts2);
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ lb = Slice(lb2);
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) {
+ ASSERT_OK(Put("aaa", "v"));
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ // Move both files to bottom level.
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Slice lower_bound("b");
+ ReadOptions read_opts;
+ read_opts.iterate_lower_bound = &lower_bound;
+ std::unique_ptr<Iterator> iter(NewIterator(read_opts));
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
+ testing::Values(true, false));
+
+// Tests how DBIter work with ReadCallback
+class DBIteratorWithReadCallbackTest : public DBIteratorTest {};
+
+TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) {
+ class TestReadCallback : public ReadCallback {
+ public:
+ explicit TestReadCallback(SequenceNumber _max_visible_seq)
+ : ReadCallback(_max_visible_seq) {}
+
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= max_visible_seq_;
+ }
+ };
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("z", "vz"));
+ SequenceNumber seq1 = db_->GetLatestSequenceNumber();
+ TestReadCallback callback1(seq1);
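+  // callback1 only admits sequence numbers <= seq1, so the writes made after
+  // this point (v4, v5 of "foo" and "bar") are invisible to the iterator below.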
+ ASSERT_OK(Put("foo", "v4"));
+ ASSERT_OK(Put("foo", "v5"));
+ ASSERT_OK(Put("bar", "v7"));
+
+ SequenceNumber seq2 = db_->GetLatestSequenceNumber();
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+ ->cfd();
+  // The iterator is supposed to see data up to and including seq1.
+ Iterator* iter =
+ dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1);
+
+ // Seek
+ // The latest value of "foo" before seq1 is "v3"
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key
+ // "foo".
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+
+ // Next
+ // Seek to "a"
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("va", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key
+ // "foo".
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+
+ // Prev
+ // Seek to "z"
+ iter->Seek("z");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("vz", iter->value());
+ // The previous key is "foo", which is visible to the iterator.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key "a".
+ iter->Prev(); // skipping "bar"
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("a", iter->key());
+ ASSERT_EQ("va", iter->value());
+
+ // SeekForPrev
+ // The previous key is "foo", which is visible to the iterator.
+ iter->SeekForPrev("y");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key "a".
+ iter->SeekForPrev("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("a", iter->key());
+ ASSERT_EQ("va", iter->value());
+
+ delete iter;
+
+ // Prev beyond max_sequential_skip_in_iterations
+ uint64_t num_versions =
+ CurrentOptions().max_sequential_skip_in_iterations + 10;
+ for (uint64_t i = 0; i < num_versions; i++) {
+ ASSERT_OK(Put("bar", ToString(i)));
+ }
+ SequenceNumber seq3 = db_->GetLatestSequenceNumber();
+ TestReadCallback callback2(seq3);
+ ASSERT_OK(Put("bar", "v8"));
+ SequenceNumber seq4 = db_->GetLatestSequenceNumber();
+
+  // The iterator is supposed to see data up to and including seq3.
+ iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2);
+ // Seek to "z", which is visible.
+ iter->Seek("z");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("vz", iter->value());
+ // Previous key is "foo" and the last value "v5" is visible.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v5", iter->value());
+  // Since the number of values of "bar" is more than
+  // max_sequential_skip_in_iterations, Prev() will ultimately fall back to a
+  // seek in the forward direction. Here we test that the fallback seek is
+  // correct. The last visible value should be (num_versions - 1), as "v8" is
+  // not visible.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bar", iter->key());
+ ASSERT_EQ(ToString(num_versions - 1), iter->value());
+
+ delete iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_log_iter_test.cc b/src/rocksdb/db/db_log_iter_test.cc
new file mode 100644
index 000000000..1f9ff0d45
--- /dev/null
+++ b/src/rocksdb/db/db_log_iter_test.cc
@@ -0,0 +1,294 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestXactLogIterator : public DBTestBase {
+ public:
+ DBTestXactLogIterator() : DBTestBase("/db_log_iter_test") {}
+
+ std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+ const SequenceNumber seq) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ Status status = dbfull()->GetUpdatesSince(seq, &iter);
+ EXPECT_OK(status);
+ EXPECT_TRUE(iter->Valid());
+ return iter;
+ }
+};
+
+namespace {
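+// Drains the iterator, checking that batch sequence numbers strictly increase
+// and that the status stays OK. Sets `count` to the number of batches read and
+// returns the sequence number of the last batch.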
+SequenceNumber ReadRecords(
+ std::unique_ptr<TransactionLogIterator>& iter,
+ int& count) {
+ count = 0;
+ SequenceNumber lastSequence = 0;
+ BatchResult res;
+ while (iter->Valid()) {
+ res = iter->GetBatch();
+ EXPECT_TRUE(res.sequence > lastSequence);
+ ++count;
+ lastSequence = res.sequence;
+ EXPECT_OK(iter->status());
+ iter->Next();
+ }
+ return res.sequence;
+}
+
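+// Reads all remaining records and asserts that exactly `expected_no_records`
+// batches were seen.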
+void ExpectRecords(
+ const int expected_no_records,
+ std::unique_ptr<TransactionLogIterator>& iter) {
+ int num_records;
+ ReadRecords(iter, num_records);
+ ASSERT_EQ(num_records, expected_no_records);
+}
+} // namespace
+
+TEST_F(DBTestXactLogIterator, TransactionLogIterator) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put(0, "key1", DummyString(1024));
+ Put(1, "key2", DummyString(1024));
+ Put(1, "key2", DummyString(1024));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(3, iter);
+ }
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ env_->SleepForMicroseconds(2 * 1000 * 1000);
+ {
+ Put(0, "key4", DummyString(1024));
+ Put(1, "key5", DummyString(1024));
+ Put(0, "key6", DummyString(1024));
+ }
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(6, iter);
+ }
+ } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG // sync points are not compiled in with -DNDEBUG (release) builds
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) {
+ static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+ static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+ {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1",
+ "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+ {"WalManager::GetSortedWalsOfType:1",
+ "WalManager::PurgeObsoleteFiles:1",
+ "WalManager::PurgeObsoleteFiles:2",
+ "WalManager::GetSortedWalsOfType:2"}};
+ for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Set up a sync point dependency to reproduce the race condition of a log
+    // file being moved to the archive dir in the middle of GetSortedWalFiles.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {sync_points[test][0], sync_points[test][1]},
+ {sync_points[test][2], sync_points[test][3]},
+ });
+
+ do {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ Put("key1", DummyString(1024));
+ dbfull()->Flush(FlushOptions());
+ Put("key2", DummyString(1024));
+ dbfull()->Flush(FlushOptions());
+ Put("key3", DummyString(1024));
+ dbfull()->Flush(FlushOptions());
+ Put("key4", DummyString(1024));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+ dbfull()->FlushWAL(false);
+
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(4, iter);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      // Trigger an async flush and a log move. The log move will wait at
+      // GetSortedWalFiles:1 to reproduce the race condition.
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ dbfull()->Flush(flush_options);
+
+ // "key5" would be written in a new memtable and log
+ Put("key5", DummyString(1024));
+ dbfull()->FlushWAL(false);
+ {
+        // Without the fix, this iterator would miss "key4".
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(5, iter);
+ }
+ } while (ChangeCompactOptions());
+ }
+}
+#endif
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ Put("key1", DummyString(1024));
+ auto iter = OpenTransactionLogIter(0);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ Put("key2", DummyString(1024));
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ Put("key1", DummyString(1024));
+ Put("key2", DummyString(1023));
+ dbfull()->Flush(FlushOptions());
+ Reopen(options);
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(2, iter);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ for (int i = 0; i < 1024; i++) {
+ Put("key"+ToString(i), DummyString(10));
+ }
+ dbfull()->Flush(FlushOptions());
+ dbfull()->FlushWAL(false);
+ // Corrupt this log to create a gap
+ ROCKSDB_NAMESPACE::VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+ if (mem_env_) {
+ mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2);
+ } else {
+ ASSERT_EQ(0, truncate(logfile_path.c_str(),
+ wal_files.front()->SizeFileBytes() / 2));
+ }
+
+ // Insert a new entry to a new log file
+ Put("key1025", DummyString(10));
+ dbfull()->FlushWAL(false);
+ // Try to read from the beginning. Should stop before the gap and read less
+ // than 1025 entries
+ auto iter = OpenTransactionLogIter(0);
+ int count;
+ SequenceNumber last_sequence_read = ReadRecords(iter, count);
+ ASSERT_LT(last_sequence_read, 1025U);
+ // Try to read past the gap, should be able to seek to key1025
+ auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+ ExpectRecords(1, iter2);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteBatch batch;
+ batch.Put(handles_[1], "key1", DummyString(1024));
+ batch.Put(handles_[0], "key2", DummyString(1024));
+ batch.Put(handles_[1], "key3", DummyString(1024));
+ batch.Delete(handles_[0], "key2");
+ dbfull()->Write(WriteOptions(), &batch);
+ Flush(1);
+ Flush(0);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ Put(1, "key4", DummyString(1024));
+ auto iter = OpenTransactionLogIter(3);
+ ExpectRecords(2, iter);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ {
+ WriteBatch batch;
+ batch.Put(handles_[1], "key1", DummyString(1024));
+ batch.Put(handles_[0], "key2", DummyString(1024));
+ batch.PutLogData(Slice("blob1"));
+ batch.Put(handles_[1], "key3", DummyString(1024));
+ batch.PutLogData(Slice("blob2"));
+ batch.Delete(handles_[0], "key2");
+ dbfull()->Write(WriteOptions(), &batch);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ }
+
+ auto res = OpenTransactionLogIter(0)->GetBatch();
+ struct Handler : public WriteBatch::Handler {
+ std::string seen;
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " +
+ ToString(value.size()) + ")";
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " +
+ ToString(value.size()) + ")";
+ return Status::OK();
+ }
+ void LogData(const Slice& blob) override {
+ seen += "LogData(" + blob.ToString() + ")";
+ }
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")";
+ return Status::OK();
+ }
+ } handler;
+ res.writeBatchPtr->Iterate(&handler);
+ ASSERT_EQ(
+ "Put(1, key1, 1024)"
+ "Put(0, key2, 1024)"
+ "LogData(blob1)"
+ "Put(1, key3, 1024)"
+ "LogData(blob2)"
+ "Delete(0, key2)",
+ handler.seen);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_memtable_test.cc b/src/rocksdb/db/db_memtable_test.cc
new file mode 100644
index 000000000..a2f4e327c
--- /dev/null
+++ b/src/rocksdb/db/db_memtable_test.cc
@@ -0,0 +1,340 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMemTableTest : public DBTestBase {
+ public:
+ DBMemTableTest() : DBTestBase("/db_memtable_test") {}
+};
+
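+// Wraps a real MemTableRep (a skip list in these tests), forwarding all calls
+// to it while recording how many InsertWithHint calls were made and the hint
+// values passed in and out.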
+class MockMemTableRep : public MemTableRep {
+ public:
+ explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep)
+ : MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
+
+ KeyHandle Allocate(const size_t len, char** buf) override {
+ return rep_->Allocate(len, buf);
+ }
+
+ void Insert(KeyHandle handle) override { rep_->Insert(handle); }
+
+ void InsertWithHint(KeyHandle handle, void** hint) override {
+ num_insert_with_hint_++;
+ EXPECT_NE(nullptr, hint);
+ last_hint_in_ = *hint;
+ rep_->InsertWithHint(handle, hint);
+ last_hint_out_ = *hint;
+ }
+
+ bool Contains(const char* key) const override { return rep_->Contains(key); }
+
+ void Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry)) override {
+ rep_->Get(k, callback_args, callback_func);
+ }
+
+ size_t ApproximateMemoryUsage() override {
+ return rep_->ApproximateMemoryUsage();
+ }
+
+ Iterator* GetIterator(Arena* arena) override {
+ return rep_->GetIterator(arena);
+ }
+
+ void* last_hint_in() { return last_hint_in_; }
+ void* last_hint_out() { return last_hint_out_; }
+ int num_insert_with_hint() { return num_insert_with_hint_; }
+
+ private:
+ std::unique_ptr<MemTableRep> rep_;
+ void* last_hint_in_;
+ void* last_hint_out_;
+ int num_insert_with_hint_;
+};
+
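+// Creates skip-list reps wrapped in MockMemTableRep and remembers the column
+// family id of the most recent CreateMemTableRep call.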
+class MockMemTableRepFactory : public MemTableRepFactory {
+ public:
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+ Allocator* allocator,
+ const SliceTransform* transform,
+ Logger* logger) override {
+ SkipListFactory factory;
+ MemTableRep* skiplist_rep =
+ factory.CreateMemTableRep(cmp, allocator, transform, logger);
+ mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
+ return mock_rep_;
+ }
+
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+ Allocator* allocator,
+ const SliceTransform* transform,
+ Logger* logger,
+ uint32_t column_family_id) override {
+ last_column_family_id_ = column_family_id;
+ return CreateMemTableRep(cmp, allocator, transform, logger);
+ }
+
+ const char* Name() const override { return "MockMemTableRepFactory"; }
+
+ MockMemTableRep* rep() { return mock_rep_; }
+
+ bool IsInsertConcurrentlySupported() const override { return false; }
+
+ uint32_t GetLastColumnFamilyId() { return last_column_family_id_; }
+
+ private:
+ MockMemTableRep* mock_rep_;
+ // workaround since there's no port::kMaxUint32 yet.
+ uint32_t last_column_family_id_ = static_cast<uint32_t>(-1);
+};
+
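+// Treats everything up to and including the first '_' as the key's prefix;
+// keys without an underscore are outside the prefix domain.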
+class TestPrefixExtractor : public SliceTransform {
+ public:
+ const char* Name() const override { return "TestPrefixExtractor"; }
+
+ Slice Transform(const Slice& key) const override {
+ const char* p = separator(key);
+ if (p == nullptr) {
+ return Slice();
+ }
+ return Slice(key.data(), p - key.data() + 1);
+ }
+
+ bool InDomain(const Slice& key) const override {
+ return separator(key) != nullptr;
+ }
+
+ bool InRange(const Slice& /*key*/) const override { return false; }
+
+ private:
+ const char* separator(const Slice& key) const {
+ return reinterpret_cast<const char*>(memchr(key.data(), '_', key.size()));
+ }
+};
+
+// Test that ::Add properly returns false when inserting duplicate keys
+TEST_F(DBMemTableTest, DuplicateSeq) {
+ SequenceNumber seq = 123;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ Options options;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&ikey_cmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+
+ // Write some keys and make sure it returns false on duplicates
+ bool res;
+ res = mem->Add(seq, kTypeValue, "key", "value2");
+ ASSERT_TRUE(res);
+ res = mem->Add(seq, kTypeValue, "key", "value2");
+ ASSERT_FALSE(res);
+  // Changing only the type should still be treated as a duplicate key
+ res = mem->Add(seq, kTypeMerge, "key", "value2");
+ ASSERT_FALSE(res);
+ // Changing the seq number will make the key fresh
+ res = mem->Add(seq + 1, kTypeMerge, "key", "value2");
+ ASSERT_TRUE(res);
+ // Test with different types for duplicate keys
+ res = mem->Add(seq, kTypeDeletion, "key", "");
+ ASSERT_FALSE(res);
+ res = mem->Add(seq, kTypeSingleDeletion, "key", "");
+ ASSERT_FALSE(res);
+
+ // Test the duplicate keys under stress
+ for (int i = 0; i < 10000; i++) {
+ bool insert_dup = i % 10 == 1;
+ if (!insert_dup) {
+ seq++;
+ }
+ res = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq));
+ if (insert_dup) {
+ ASSERT_FALSE(res);
+ } else {
+ ASSERT_TRUE(res);
+ }
+ }
+ delete mem;
+
+ // Test with InsertWithHint
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ new TestPrefixExtractor()); // which uses _ to extract the prefix
+ ioptions = ImmutableCFOptions(options);
+ mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ // Insert a duplicate key with _ in it
+ res = mem->Add(seq, kTypeValue, "key_1", "value");
+ ASSERT_TRUE(res);
+ res = mem->Add(seq, kTypeValue, "key_1", "value");
+ ASSERT_FALSE(res);
+ delete mem;
+
+ // Test when InsertConcurrently will be invoked
+ options.allow_concurrent_memtable_write = true;
+ ioptions = ImmutableCFOptions(options);
+ mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ MemTablePostProcessInfo post_process_info;
+ res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info);
+ ASSERT_TRUE(res);
+ res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info);
+ ASSERT_FALSE(res);
+ delete mem;
+}
+
+// A simple test to verify that concurrent merge writes are functional
+TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
+ int num_ops = 1000;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ Options options;
+  // A merge operator that is not sensitive to concurrent writes, since this
+  // test does not order the writes.
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ options.allow_concurrent_memtable_write = true;
+ ImmutableCFOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+
+ // Put 0 as the base
+ PutFixed64(&value, static_cast<uint64_t>(0));
+ bool res = mem->Add(0, kTypeValue, "key", value);
+ ASSERT_TRUE(res);
+ value.clear();
+
+ // Write Merge concurrently
+ ROCKSDB_NAMESPACE::port::Thread write_thread1([&]() {
+ MemTablePostProcessInfo post_process_info1;
+ std::string v1;
+ for (int seq = 1; seq < num_ops / 2; seq++) {
+ PutFixed64(&v1, seq);
+ bool res1 =
+ mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1);
+ ASSERT_TRUE(res1);
+ v1.clear();
+ }
+ });
+ ROCKSDB_NAMESPACE::port::Thread write_thread2([&]() {
+ MemTablePostProcessInfo post_process_info2;
+ std::string v2;
+ for (int seq = num_ops / 2; seq < num_ops; seq++) {
+ PutFixed64(&v2, seq);
+ bool res2 =
+ mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2);
+ ASSERT_TRUE(res2);
+ v2.clear();
+ }
+ });
+ write_thread1.join();
+ write_thread2.join();
+
+ Status status;
+ ReadOptions roptions;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey("key", kMaxSequenceNumber);
+ res = mem->Get(lkey, &value, &status, &merge_context,
+ &max_covering_tombstone_seq, roptions);
+ ASSERT_TRUE(res);
+ uint64_t ivalue = DecodeFixed64(Slice(value).data());
+ uint64_t sum = 0;
+ for (int seq = 0; seq < num_ops; seq++) {
+ sum += seq;
+ }
+ ASSERT_EQ(ivalue, sum);
+
+ delete mem;
+}
+
+TEST_F(DBMemTableTest, InsertWithHint) {
+ Options options;
+ options.allow_concurrent_memtable_write = false;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(new MockMemTableRepFactory());
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ new TestPrefixExtractor());
+ options.env = env_;
+ Reopen(options);
+ MockMemTableRep* rep =
+ reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+ ->rep();
+ ASSERT_OK(Put("foo_k1", "foo_v1"));
+ ASSERT_EQ(nullptr, rep->last_hint_in());
+ void* hint_foo = rep->last_hint_out();
+ ASSERT_OK(Put("foo_k2", "foo_v2"));
+ ASSERT_EQ(hint_foo, rep->last_hint_in());
+ ASSERT_EQ(hint_foo, rep->last_hint_out());
+ ASSERT_OK(Put("foo_k3", "foo_v3"));
+ ASSERT_EQ(hint_foo, rep->last_hint_in());
+ ASSERT_EQ(hint_foo, rep->last_hint_out());
+ ASSERT_OK(Put("bar_k1", "bar_v1"));
+ ASSERT_EQ(nullptr, rep->last_hint_in());
+ void* hint_bar = rep->last_hint_out();
+ ASSERT_NE(hint_foo, hint_bar);
+ ASSERT_OK(Put("bar_k2", "bar_v2"));
+ ASSERT_EQ(hint_bar, rep->last_hint_in());
+ ASSERT_EQ(hint_bar, rep->last_hint_out());
+ ASSERT_EQ(5, rep->num_insert_with_hint());
+ ASSERT_OK(Put("whitelisted", "vvv"));
+ ASSERT_EQ(5, rep->num_insert_with_hint());
+ ASSERT_EQ("foo_v1", Get("foo_k1"));
+ ASSERT_EQ("foo_v2", Get("foo_k2"));
+ ASSERT_EQ("foo_v3", Get("foo_k3"));
+ ASSERT_EQ("bar_v1", Get("bar_k1"));
+ ASSERT_EQ("bar_v2", Get("bar_k2"));
+ ASSERT_EQ("vvv", Get("whitelisted"));
+}
+
+TEST_F(DBMemTableTest, ColumnFamilyId) {
+ // Verifies MemTableRepFactory is told the right column family id.
+ Options options;
+ options.allow_concurrent_memtable_write = false;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(new MockMemTableRepFactory());
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Flush(cf));
+ ASSERT_EQ(
+ cf, static_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+ ->GetLastColumnFamilyId());
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operand_test.cc b/src/rocksdb/db/db_merge_operand_test.cc
new file mode 100644
index 000000000..a0ab34e01
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operand_test.cc
@@ -0,0 +1,240 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/fault_injection_test_env.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMergeOperandTest : public DBTestBase {
+ public:
+ DBMergeOperandTest() : DBTestBase("/db_merge_operand_test") {}
+};
+
+TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
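+  // A string-append operator whose ShouldMerge() fires once `limit_` operands
+  // have accumulated, so reads only consider the newest `limit_` operands.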
+ class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+ LimitedStringAppendMergeOp(int limit, char delim)
+ : StringAppendTESTOperator(delim), limit_(limit) {}
+
+ const char* Name() const override {
+ return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+ }
+
+ bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ size_t limit_ = 0;
+ };
+
+ Options options;
+ options.create_if_missing = true;
+ // Use only the latest two merge operands.
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ int num_records = 4;
+ int number_of_operands = 0;
+ std::vector<PinnableSlice> values(num_records);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // k0 value in memtable
+ Put("k0", "PutARock");
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "PutARock");
+
+ // k0.1 value in SST
+ Put("k0.1", "RockInSST");
+ ASSERT_OK(Flush());
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "RockInSST");
+
+ // All k1 values are in memtable.
+ ASSERT_OK(Merge("k1", "a"));
+ Put("k1", "x");
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "x");
+ ASSERT_EQ(values[1], "b");
+ ASSERT_EQ(values[2], "c");
+ ASSERT_EQ(values[3], "d");
+
+  // expected_max_number_of_operands is less than the number of merge operands,
+  // so the status should be Incomplete.
+ merge_operands_info.expected_max_number_of_operands = num_records - 1;
+ Status status = db_->GetMergeOperands(
+ ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(),
+ &merge_operands_info, &number_of_operands);
+ ASSERT_EQ(status.IsIncomplete(), true);
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // All k1.1 values are in memtable.
+ ASSERT_OK(Merge("k1.1", "r"));
+ Delete("k1.1");
+ ASSERT_OK(Merge("k1.1", "c"));
+ ASSERT_OK(Merge("k1.1", "k"));
+ ASSERT_OK(Merge("k1.1", "s"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "c");
+ ASSERT_EQ(values[1], "k");
+ ASSERT_EQ(values[2], "s");
+
+ // All k2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2", "q"));
+ ASSERT_OK(Merge("k2", "w"));
+ ASSERT_OK(Merge("k2", "e"));
+ ASSERT_OK(Merge("k2", "r"));
+ ASSERT_OK(Flush());
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "q");
+ ASSERT_EQ(values[1], "w");
+ ASSERT_EQ(values[2], "e");
+ ASSERT_EQ(values[3], "r");
+
+ // All k2.1 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2.1", "m"));
+ Put("k2.1", "l");
+ ASSERT_OK(Merge("k2.1", "n"));
+ ASSERT_OK(Merge("k2.1", "o"));
+ ASSERT_OK(Flush());
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "l,n,o");
+
+ // All k2.2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2.2", "g"));
+ Delete("k2.2");
+ ASSERT_OK(Merge("k2.2", "o"));
+ ASSERT_OK(Merge("k2.2", "t"));
+ ASSERT_OK(Flush());
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "o,t");
+
+ // Do some compaction that will make the following tests more predictable
+ // Slice start("PutARock");
+ // Slice end("t");
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ // All k3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "ab");
+ ASSERT_EQ(values[1], "bc");
+ ASSERT_EQ(values[2], "cd");
+ ASSERT_EQ(values[3], "de");
+
+ // All k3.1 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3.1", "ab"));
+ ASSERT_OK(Flush());
+ Put("k3.1", "bc");
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.1", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.1", "de"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "bc");
+ ASSERT_EQ(values[1], "cd");
+ ASSERT_EQ(values[2], "de");
+
+ // All k3.2 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3.2", "ab"));
+ ASSERT_OK(Flush());
+ Delete("k3.2");
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.2", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.2", "de"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "cd");
+ ASSERT_EQ(values[1], "de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Merge("k4", "ba"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "cb"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "dc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "ed"));
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "ba");
+ ASSERT_EQ(values[1], "cb");
+ ASSERT_EQ(values[2], "dc");
+ ASSERT_EQ(values[3], "ed");
+
+  // The first 3 k5 values are in an SST file and the next 4 k5 values are in
+  // the immutable memtable.
+ ASSERT_OK(Merge("k5", "who"));
+ ASSERT_OK(Merge("k5", "am"));
+ ASSERT_OK(Merge("k5", "i"));
+ ASSERT_OK(Flush());
+ Put("k5", "remember");
+ ASSERT_OK(Merge("k5", "i"));
+ ASSERT_OK(Merge("k5", "am"));
+ ASSERT_OK(Merge("k5", "rocks"));
+ dbfull()->TEST_SwitchMemtable();
+ db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5",
+ values.data(), &merge_operands_info,
+ &number_of_operands);
+ ASSERT_EQ(values[0], "remember");
+ ASSERT_EQ(values[1], "i");
+ ASSERT_EQ(values[2], "am");
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operator_test.cc b/src/rocksdb/db/db_merge_operator_test.cc
new file mode 100644
index 000000000..4f762468d
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operator_test.cc
@@ -0,0 +1,666 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
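+// A ReadCallback that defers visibility decisions to a SnapshotChecker for the
+// given snapshot sequence number.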
+class TestReadCallback : public ReadCallback {
+ public:
+ TestReadCallback(SnapshotChecker* snapshot_checker,
+ SequenceNumber snapshot_seq)
+ : ReadCallback(snapshot_seq),
+ snapshot_checker_(snapshot_checker),
+ snapshot_seq_(snapshot_seq) {}
+
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return snapshot_checker_->CheckInSnapshot(seq, snapshot_seq_) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ private:
+ SnapshotChecker* snapshot_checker_;
+ SequenceNumber snapshot_seq_;
+};
+
+// Test merge operator functionality.
+class DBMergeOperatorTest : public DBTestBase {
+ public:
+ DBMergeOperatorTest() : DBTestBase("/db_merge_operator_test") {}
+
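+  // Gets `key` via DBImpl::GetImpl with a TestReadCallback installed, so
+  // visibility is decided by the snapshot checker; returns the value, or the
+  // status string if the read fails.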
+ std::string GetWithReadCallback(SnapshotChecker* snapshot_checker,
+ const Slice& key,
+ const Snapshot* snapshot = nullptr) {
+ SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber()
+ : snapshot->GetSequenceNumber();
+ TestReadCallback read_callback(snapshot_checker, seq);
+ ReadOptions read_opt;
+ read_opt.snapshot = snapshot;
+ PinnableSlice value;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = db_->DefaultColumnFamily();
+ get_impl_options.value = &value;
+ get_impl_options.callback = &read_callback;
+ Status s = dbfull()->GetImpl(read_opt, key, get_impl_options);
+ if (!s.ok()) {
+ return s.ToString();
+ }
+ return value.ToString();
+ }
+};
+
+TEST_F(DBMergeOperatorTest, LimitMergeOperands) {
+ class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+ LimitedStringAppendMergeOp(int limit, char delim)
+ : StringAppendTESTOperator(delim), limit_(limit) {}
+
+ const char* Name() const override {
+ return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+ }
+
+ bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ size_t limit_ = 0;
+ };
+
+ Options options;
+ options.create_if_missing = true;
+ // Use only the latest two merge operands.
+ options.merge_operator =
+ std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ // All K1 values are in memtable.
+ ASSERT_OK(Merge("k1", "a"));
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).ok());
+ // Make sure that only the latest two merge operands are used. If this was
+ // not the case the value would be "a,b,c,d".
+ ASSERT_EQ(value, "c,d");
+
+ // All K2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2", "a"));
+ ASSERT_OK(Merge("k2", "b"));
+ ASSERT_OK(Merge("k2", "c"));
+ ASSERT_OK(Merge("k2", "d"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k2", &value).ok());
+ ASSERT_EQ(value, "c,d");
+
+ // All K3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k3", &value).ok());
+ ASSERT_EQ(value, "cd,de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Merge("k4", "ab"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "bc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "cd"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "de"));
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k4", &value).ok());
+ ASSERT_EQ(value, "cd,de");
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnRead) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "corrupted"));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption());
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.max_successive_merges = 3;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "v2"));
+ // Will trigger a merge when hitting max_successive_merges and the merge
+ // will fail. The delta will be inserted nevertheless.
+ ASSERT_OK(Merge("k1", "corrupted"));
+ // Data should stay unmerged after the error.
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v2"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.env = env_;
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "corrupted"));
+ ASSERT_OK(Put("k2", "v2"));
+ auto* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k1");
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}, {"k2", "v2"}});
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Put("k2", "v2"));
+ ASSERT_OK(Merge("k2", "corrupted"));
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}});
+}
+
+
+class MergeOperatorPinningTest : public DBMergeOperatorTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ MergeOperatorPinningTest() { disable_block_cache_ = GetParam(); }
+
+ bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest,
+ ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1; // every block will contain one entry
+ table_options.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ const int kKeysPerFile = 10;
+ const int kOperandsPerKeyPerFile = 7;
+ const int kOperandSize = 100;
+  // Files to write to L0 before compacting to a lower level
+ const int kFilesPerLevel = 3;
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ int batch_num = 1;
+ int lvl_to_fill = 4;
+ int key_id = 0;
+ while (true) {
+ for (int j = 0; j < kKeysPerFile; j++) {
+ std::string key = Key(key_id % 35);
+ key_id++;
+ for (int k = 0; k < kOperandsPerKeyPerFile; k++) {
+ std::string val = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), key, val));
+ if (true_data[key].size() == 0) {
+ true_data[key] = val;
+ } else {
+ true_data[key] += "," + val;
+ }
+ }
+ }
+
+ if (lvl_to_fill == -1) {
+ // Keep last batch in memtable and stop
+ break;
+ }
+
+ ASSERT_OK(Flush());
+ if (batch_num % kFilesPerLevel == 0) {
+ if (lvl_to_fill != 0) {
+ MoveFilesToLevel(lvl_to_fill);
+ }
+ lvl_to_fill--;
+ }
+ batch_num++;
+ }
+
+ // 3 L0 files
+ // 1 L1 file
+ // 3 L2 files
+ // 1 L3 file
+  // 3 L4 files
+ ASSERT_EQ(FilesPerLevel(), "3,1,3,1,3");
+
+ VerifyDBFromMap(true_data);
+}
+
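+// Wraps another MergeOperator and runs the before_merge_/after_merge_
+// callbacks around each FullMergeV2 call.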
+class MergeOperatorHook : public MergeOperator {
+ public:
+ explicit MergeOperatorHook(std::shared_ptr<MergeOperator> _merge_op)
+ : merge_op_(_merge_op) {}
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ before_merge_();
+ bool res = merge_op_->FullMergeV2(merge_in, merge_out);
+ after_merge_();
+ return res;
+ }
+
+ const char* Name() const override { return merge_op_->Name(); }
+
+ std::shared_ptr<MergeOperator> merge_op_;
+ std::function<void()> before_merge_ = []() {};
+ std::function<void()> after_merge_ = []() {};
+};
+
+TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) {
+ Options options = CurrentOptions();
+
+ auto merge_hook =
+ std::make_shared<MergeOperatorHook>(MergeOperators::CreateMaxOperator());
+ options.merge_operator = merge_hook;
+ options.disable_auto_compactions = true;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.max_open_files = 20;
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = disable_block_cache_;
+ if (bbto.no_block_cache == false) {
+ bbto.block_cache = NewLRUCache(64 * 1024 * 1024);
+ } else {
+ bbto.block_cache = nullptr;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const int kNumOperands = 30;
+ const int kNumKeys = 1000;
+ const int kOperandSize = 100;
+ Random rnd(301);
+
+  // 1000 keys; each key has 30 operands, and each operand is in a different
+  // file
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < kNumOperands; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ std::string k = Key(j);
+ std::string v = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+
+ true_data[k] = std::max(true_data[k], v);
+ }
+ ASSERT_OK(Flush());
+ }
+
+ std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(file_numbers.size(), kNumOperands);
+ int merge_cnt = 0;
+
+ // Code executed before merge operation
+ merge_hook->before_merge_ = [&]() {
+ // Evict all tables from cache before every merge operation
+ for (uint64_t num : file_numbers) {
+ TableCache::Evict(dbfull()->TEST_table_cache(), num);
+ }
+ // Decrease cache capacity to force all unrefed blocks to be evicted
+ if (bbto.block_cache) {
+ bbto.block_cache->SetCapacity(1);
+ }
+ merge_cnt++;
+ };
+
+ // Code executed after merge operation
+ merge_hook->after_merge_ = [&]() {
+ // Increase capacity again after doing the merge
+ if (bbto.block_cache) {
+ bbto.block_cache->SetCapacity(64 * 1024 * 1024);
+ }
+ };
+
+ size_t total_reads;
+ VerifyDBFromMap(true_data, &total_reads);
+ ASSERT_EQ(merge_cnt, total_reads);
+
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ VerifyDBFromMap(true_data, &total_reads);
+}
+
+TEST_P(MergeOperatorPinningTest, TailingIterator) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateMaxOperator();
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const int kNumOperands = 100;
+ const int kNumWrites = 100000;
+
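+  // Writer: write kNumOperands merge operands per key (each operand equals the
+  // key), with periodic flushes and full compactions so operands end up spread
+  // across memtables and multiple SST files.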
+ std::function<void()> writer_func = [&]() {
+ int k = 0;
+ for (int i = 0; i < kNumWrites; i++) {
+ db_->Merge(WriteOptions(), Key(k), Key(k));
+
+ if (i && i % kNumOperands == 0) {
+ k++;
+ }
+ if (i && i % 127 == 0) {
+ ASSERT_OK(Flush());
+ }
+ if (i && i % 317 == 0) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+ };
+
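+  // Reader: tail the DB with a tailing iterator; if a key has not shown up
+  // yet, sleep and re-seek until it does, then verify key and value match.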
+ std::function<void()> reader_func = [&]() {
+ ReadOptions ro;
+ ro.tailing = true;
+ Iterator* iter = db_->NewIterator(ro);
+
+ iter->SeekToFirst();
+ for (int i = 0; i < (kNumWrites / kNumOperands); i++) {
+ while (!iter->Valid()) {
+ // wait for the key to be written
+ env_->SleepForMicroseconds(100);
+ iter->Seek(Key(i));
+ }
+ ASSERT_EQ(iter->key(), Key(i));
+ ASSERT_EQ(iter->value(), Key(i));
+
+ iter->Next();
+ }
+
+ delete iter;
+ };
+
+ ROCKSDB_NAMESPACE::port::Thread writer_thread(writer_func);
+ ROCKSDB_NAMESPACE::port::Thread reader_thread(reader_func);
+
+ writer_thread.join();
+ reader_thread.join();
+}
+
+TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ // Overview of the test:
+ // * There are two merge operands for the same key: one in an sst file,
+ // another in a memtable.
+ // * Seek a tailing iterator to this key.
+ // * As part of the seek, the iterator will:
+ // (a) first visit the operand in the memtable and tell ForwardIterator
+ // to pin this operand, then
+ // (b) move on to the operand in the sst file, then pass both operands
+ // to merge operator.
+ // * The memtable may get flushed and unreferenced by another thread between
+ // (a) and (b). The test simulates it by flushing the memtable inside a
+ // SyncPoint callback located between (a) and (b).
+ // * In this case it's ForwardIterator's responsibility to keep the memtable
+ // pinned until (b) is complete. There used to be a bug causing
+ // ForwardIterator to not pin it in some circumstances. This test
+ // reproduces it.
+
+ db_->Merge(WriteOptions(), "key", "sst");
+ db_->Flush(FlushOptions()); // Switch to SuperVersion A
+ db_->Merge(WriteOptions(), "key", "memtable");
+
+ // Pin SuperVersion A
+ std::unique_ptr<Iterator> someone_else(db_->NewIterator(ReadOptions()));
+
+ bool pushed_first_operand = false;
+ bool stepped_to_next_operand = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) {
+ EXPECT_FALSE(pushed_first_operand);
+ pushed_first_operand = true;
+ db_->Flush(FlushOptions()); // Switch to SuperVersion B
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) {
+ EXPECT_FALSE(stepped_to_next_operand);
+ stepped_to_next_operand = true;
+ someone_else.reset(); // Unpin SuperVersion A
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ReadOptions ro;
+ ro.tailing = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+ iter->Seek("key");
+
+ ASSERT_TRUE(iter->status().ok());
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString());
+ EXPECT_TRUE(pushed_first_operand);
+ EXPECT_TRUE(stepped_to_next_operand);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
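+  // A checker with hard-coded visibility rules: writes at seq 2 and seq 4 are
+  // not visible to snapshots taken at those sequence numbers, and anything at
+  // seq >= 5 is treated as uncommitted.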
+ class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ return IsInSnapshot(seq, snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ bool IsInSnapshot(SequenceNumber seq, SequenceNumber snapshot_seq) const {
+ switch (snapshot_seq) {
+ case 0:
+ return seq == 0;
+ case 1:
+ return seq <= 1;
+ case 2:
+ // seq = 2 not visible to snapshot with seq = 2
+ return seq <= 1;
+ case 3:
+ return seq <= 3;
+ case 4:
+          // seq = 4 not visible to snapshot with seq = 4
+ return seq <= 3;
+ default:
+          // seq >= 5 is uncommitted
+ return seq <= 4;
+ };
+ }
+ };
+ TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker();
+ dbfull()->SetSnapshotChecker(snapshot_checker);
+
+ std::string value;
+ ASSERT_OK(Merge("foo", "v1"));
+ ASSERT_EQ(1, db_->GetLatestSequenceNumber());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+ ASSERT_OK(Merge("foo", "v2"));
+ ASSERT_EQ(2, db_->GetLatestSequenceNumber());
+ // v2 is not visible to latest snapshot, which has seq = 2.
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+ // Take a snapshot with seq = 2.
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ ASSERT_EQ(2, snapshot1->GetSequenceNumber());
+ // v2 is not visible to snapshot1, which has seq = 2
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+
+ // Verify flush doesn't alter the result.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+
+ ASSERT_OK(Merge("foo", "v3"));
+ ASSERT_EQ(3, db_->GetLatestSequenceNumber());
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+ ASSERT_OK(Merge("foo", "v4"));
+ ASSERT_EQ(4, db_->GetLatestSequenceNumber());
+ // v4 is not visible to latest snapshot, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+ const Snapshot* snapshot2 = db_->GetSnapshot();
+ ASSERT_EQ(4, snapshot2->GetSequenceNumber());
+ // v4 is not visible to snapshot2, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+
+ // Verify flush doesn't alter the result.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+
+ ASSERT_OK(Merge("foo", "v5"));
+ ASSERT_EQ(5, db_->GetLatestSequenceNumber());
+ // v5 is uncommitted
+ ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+ // full manual compaction.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Verify compaction doesn't alter the result.
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+ ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+ db_->ReleaseSnapshot(snapshot1);
+ db_->ReleaseSnapshot(snapshot2);
+}
+
+class PerConfigMergeOperatorPinningTest
+ : public DBMergeOperatorTest,
+ public testing::WithParamInterface<std::tuple<bool, int>> {
+ public:
+ PerConfigMergeOperatorPinningTest() {
+ std::tie(disable_block_cache_, option_config_) = GetParam();
+ }
+
+ bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest,
+ ::testing::Combine(::testing::Bool(),
+ ::testing::Range(static_cast<int>(DBTestBase::kDefault),
+ static_cast<int>(DBTestBase::kEnd))));
+
+TEST_P(PerConfigMergeOperatorPinningTest, Randomized) {
+ if (ShouldSkipOptions(option_config_, kSkipMergePut)) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateMaxOperator();
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+
+ const int kTotalMerges = 5000;
+ // Every key gets ~10 operands
+ const int kKeyRange = kTotalMerges / 10;
+ const int kOperandSize = 20;
+ const int kNumPutBefore = kKeyRange / 10; // 10% value
+ const int kNumPutAfter = kKeyRange / 10; // 10% overwrite
+ const int kNumDelete = kKeyRange / 10; // 10% delete
+
+ // kNumPutBefore keys will have base values
+ for (int i = 0; i < kNumPutBefore; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ true_data[key] = value;
+ }
+
+ // Do kTotalMerges merges
+ for (int i = 0; i < kTotalMerges; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+
+ if (true_data[key] < value) {
+ true_data[key] = value;
+ }
+ }
+
+ // Overwrite random kNumPutAfter keys
+ for (int i = 0; i < kNumPutAfter; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = RandomString(&rnd, kOperandSize);
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ true_data[key] = value;
+ }
+
+ // Delete random kNumDelete keys
+ for (int i = 0; i < kNumDelete; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ ASSERT_OK(db_->Delete(WriteOptions(), key));
+
+ true_data.erase(key);
+ }
+
+ VerifyDBFromMap(true_data);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_options_test.cc b/src/rocksdb/db/db_options_test.cc
new file mode 100644
index 000000000..383f66cbf
--- /dev/null
+++ b/src/rocksdb/db/db_options_test.cc
@@ -0,0 +1,870 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/stats_history.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionsTest : public DBTestBase {
+ public:
+ DBOptionsTest() : DBTestBase("/db_options_test") {}
+
+#ifndef ROCKSDB_LITE
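+  // Helpers that serialize the options to a string, parse the string back into
+  // a map, and keep only the mutable, non-deprecated entries.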
+ std::unordered_map<std::string, std::string> GetMutableDBOptionsMap(
+ const DBOptions& options) {
+ std::string options_str;
+ GetStringFromDBOptions(&options_str, options);
+ std::unordered_map<std::string, std::string> options_map;
+ StringToMap(options_str, &options_map);
+ std::unordered_map<std::string, std::string> mutable_map;
+ for (const auto opt : db_options_type_info) {
+ if (opt.second.is_mutable &&
+ opt.second.verification != OptionVerificationType::kDeprecated) {
+ mutable_map[opt.first] = options_map[opt.first];
+ }
+ }
+ return mutable_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetMutableCFOptionsMap(
+ const ColumnFamilyOptions& options) {
+ std::string options_str;
+ GetStringFromColumnFamilyOptions(&options_str, options);
+ std::unordered_map<std::string, std::string> options_map;
+ StringToMap(options_str, &options_map);
+ std::unordered_map<std::string, std::string> mutable_map;
+ for (const auto opt : cf_options_type_info) {
+ if (opt.second.is_mutable &&
+ opt.second.verification != OptionVerificationType::kDeprecated) {
+ mutable_map[opt.first] = options_map[opt.first];
+ }
+ }
+ return mutable_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetRandomizedMutableCFOptionsMap(
+ Random* rnd) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ ImmutableDBOptions db_options(options);
+ test::RandomInitCFOptions(&options, options, rnd);
+ auto sanitized_options = SanitizeOptions(db_options, options);
+ auto opt_map = GetMutableCFOptionsMap(sanitized_options);
+ delete options.compaction_filter;
+ return opt_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetRandomizedMutableDBOptionsMap(
+ Random* rnd) {
+ DBOptions db_options;
+ test::RandomInitDBOptions(&db_options, rnd);
+ auto sanitized_options = SanitizeOptions(dbname_, db_options);
+ return GetMutableDBOptionsMap(sanitized_options);
+ }
+#endif // ROCKSDB_LITE
+};
+
+// RocksDB lite doesn't support dynamic options.
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, GetLatestDBOptions) {
+ // GetOptions should be able to get latest option changed by SetOptions.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ Random rnd(228);
+ Reopen(options);
+ auto new_options = GetRandomizedMutableDBOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetDBOptions(new_options));
+ ASSERT_EQ(new_options, GetMutableDBOptionsMap(dbfull()->GetDBOptions()));
+}
+
+TEST_F(DBOptionsTest, GetLatestCFOptions) {
+ // GetOptions should be able to get latest option changed by SetOptions.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ Random rnd(228);
+ Reopen(options);
+ CreateColumnFamilies({"foo"}, options);
+ ReopenWithColumnFamilies({"default", "foo"}, options);
+ auto options_default = GetRandomizedMutableCFOptionsMap(&rnd);
+ auto options_foo = GetRandomizedMutableCFOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetOptions(handles_[0], options_default));
+ ASSERT_OK(dbfull()->SetOptions(handles_[1], options_foo));
+ ASSERT_EQ(options_default,
+ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[0])));
+ ASSERT_EQ(options_foo,
+ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1])));
+}
+
+TEST_F(DBOptionsTest, SetBytesPerSync) {
+ const size_t kValueSize = 1024 * 1024; // 1MB
+ Options options;
+ options.create_if_missing = true;
+ options.bytes_per_sync = 1024 * 1024;
+ options.use_direct_reads = false;
+ options.write_buffer_size = 400 * kValueSize;
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+ Reopen(options);
+ int counter = 0;
+ int low_bytes_per_sync = 0;
+ int i = 0;
+ const std::string kValue(kValueSize, 'v');
+ ASSERT_EQ(options.bytes_per_sync, dbfull()->GetDBOptions().bytes_per_sync);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+
+ WriteOptions write_opts;
+ // should sync approximately 40MB/1MB ~= 40 times.
+ for (i = 0; i < 40; i++) {
+ Put(Key(i), kValue, write_opts);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ low_bytes_per_sync = counter;
+ ASSERT_GT(low_bytes_per_sync, 35);
+ ASSERT_LT(low_bytes_per_sync, 45);
+
+ counter = 0;
+ // 8388608 = 8 * 1024 * 1024
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "8388608"}}));
+ ASSERT_EQ(8388608, dbfull()->GetDBOptions().bytes_per_sync);
+ // should sync approximately 40MB*2/8MB ~= 10 times.
+ // data will be 40*2MB because of previous Puts too.
+ for (i = 0; i < 40; i++) {
+ Put(Key(i), kValue, write_opts);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_GT(counter, 5);
+ ASSERT_LT(counter, 15);
+
+ // Redundant assert. But leaving it here just to get the point across that
+ // low_bytes_per_sync > counter.
+ ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, SetWalBytesPerSync) {
+ const size_t kValueSize = 1024 * 1024 * 3;
+ Options options;
+ options.create_if_missing = true;
+ options.wal_bytes_per_sync = 512;
+ options.write_buffer_size = 100 * kValueSize;
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync);
+ int counter = 0;
+ int low_bytes_per_sync = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kValue(kValueSize, 'v');
+ int i = 0;
+ for (; i < 10; i++) {
+ Put(Key(i), kValue);
+ }
+ // Do not flush. If we flush here, SwitchWAL will reuse the old WAL file since it's
+ // empty and will not get the new wal_bytes_per_sync value.
+ low_bytes_per_sync = counter;
+ // 5242880 = 1024 * 1024 * 5
+ ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}}));
+ ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync);
+ counter = 0;
+ i = 0;
+ for (; i < 10; i++) {
+ Put(Key(i), kValue);
+ }
+ ASSERT_GT(counter, 0);
+ ASSERT_GT(low_bytes_per_sync, 0);
+ ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
+ Options options;
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 1024 * 1024;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_manifest_file_size = 1;
+ options.env = env_;
+ int buffer_size = 1024 * 1024;
+ Reopen(options);
+ ASSERT_EQ(buffer_size,
+ dbfull()->GetDBOptions().writable_file_max_buffer_size);
+
+ std::atomic<int> match_cnt(0);
+ std::atomic<int> unmatch_cnt(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
+ int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg));
+ if (value == buffer_size) {
+ match_cnt++;
+ } else {
+ unmatch_cnt++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ int i = 0;
+ for (; i < 3; i++) {
+ ASSERT_OK(Put("foo", ToString(i)));
+ ASSERT_OK(Put("bar", ToString(i)));
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(unmatch_cnt, 0);
+ ASSERT_GE(match_cnt, 11);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
+ buffer_size = 512 * 1024;
+ match_cnt = 0;
+ unmatch_cnt = 0; // SetDBOptions() will create a WritableFileWriter
+
+ ASSERT_EQ(buffer_size,
+ dbfull()->GetDBOptions().writable_file_max_buffer_size);
+ i = 0;
+ for (; i < 3; i++) {
+ ASSERT_OK(Put("foo", ToString(i)));
+ ASSERT_OK(Put("bar", ToString(i)));
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(unmatch_cnt, 0);
+ ASSERT_GE(match_cnt, 11);
+}
+
+TEST_F(DBOptionsTest, SetOptionsAndReopen) {
+ Random rnd(1044);
+ auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetOptions(rand_opts));
+ // Verify that the DB can be reopened after setting options.
+ Options options;
+ options.env = env_;
+ ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) {
+ const std::string kValue(1024, 'v');
+ for (int method_type = 0; method_type < 2; method_type++) {
+ for (int option_type = 0; option_type < 4; option_type++) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 1024 * 1024 * 10;
+ options.compression = CompressionType::kNoCompression;
+ options.level0_file_num_compaction_trigger = 1;
+ options.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+ options.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+ options.hard_pending_compaction_bytes_limit =
+ std::numeric_limits<uint64_t>::max();
+ options.soft_pending_compaction_bytes_limit =
+ std::numeric_limits<uint64_t>::max();
+ options.env = env_;
+
+ DestroyAndReopen(options);
+ int i = 0;
+ for (; i < 1024; i++) {
+ Put(Key(i), kValue);
+ }
+ Flush();
+ for (; i < 1024 * 2; i++) {
+ Put(Key(i), kValue);
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ uint64_t l0_size = SizeAtLevel(0);
+
+ switch (option_type) {
+ case 0:
+ // test with level0_stop_writes_trigger
+ options.level0_stop_writes_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ break;
+ case 1:
+ options.level0_slowdown_writes_trigger = 2;
+ break;
+ case 2:
+ options.hard_pending_compaction_bytes_limit = l0_size;
+ options.soft_pending_compaction_bytes_limit = l0_size;
+ break;
+ case 3:
+ options.soft_pending_compaction_bytes_limit = l0_size;
+ break;
+ }
+ Reopen(options);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBOptionsTest::EnableAutoCompactionAndTriggerStall:1",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction():BeforePickCompaction",
+ "DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"},
+ {"DBOptionsTest::EnableAutoCompactionAndTriggerStall:3",
+ "DBImpl::BackgroundCompaction():AfterPickCompaction"}});
+ // Block background compaction.
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ switch (method_type) {
+ case 0:
+ ASSERT_OK(
+ dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ break;
+ case 1:
+ ASSERT_OK(dbfull()->EnableAutoCompaction(
+ {dbfull()->DefaultColumnFamily()}));
+ break;
+ }
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:1");
+ // Wait for the stall condition to be recalculated.
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:2");
+
+ switch (option_type) {
+ case 0:
+ ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+ break;
+ case 1:
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ break;
+ case 2:
+ ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+ break;
+ case 3:
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ break;
+ }
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3");
+
+ // Background compaction executed.
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+ }
+ }
+}
+
+TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) {
+ Options options;
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 1000;
+ options.env = env_;
+ Reopen(options);
+ for (int i = 0; i < 3; i++) {
+ // Need to insert two keys to avoid trivial move.
+ ASSERT_OK(Put("foo", ToString(i)));
+ ASSERT_OK(Put("bar", ToString(i)));
+ Flush();
+ }
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_compactions = 1; // default value
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_background_compactions", "3"}}));
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+ ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed());
+}
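+
+// Illustrative sketch (assuming "db" is an already-open DB*) of raising
+// compaction parallelism at runtime, as the test above exercises:
+//
+//   Status s = db->SetDBOptions({{"max_background_compactions", "3"}});
+//   assert(s.ok());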
+
+TEST_F(DBOptionsTest, SetBackgroundJobs) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_jobs = 8;
+ options.env = env_;
+ Reopen(options);
+
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ options.max_background_jobs = 12;
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"max_background_jobs",
+ std::to_string(options.max_background_jobs)}}));
+ }
+
+ const int expected_max_flushes = options.max_background_jobs / 4;
+
+ ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+
+ const int expected_max_compactions = 3 * expected_max_flushes;
+
+ ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(expected_max_compactions, dbfull()->TEST_BGCompactionsAllowed());
+
+ ASSERT_EQ(expected_max_flushes,
+ env_->GetBackgroundThreads(Env::Priority::HIGH));
+ ASSERT_EQ(expected_max_compactions,
+ env_->GetBackgroundThreads(Env::Priority::LOW));
+ }
+}
+
+TEST_F(DBOptionsTest, AvoidFlushDuringShutdown) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ WriteOptions write_without_wal;
+ write_without_wal.disableWAL = true;
+
+ ASSERT_FALSE(options.avoid_flush_during_shutdown);
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1", write_without_wal));
+ Reopen(options);
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_EQ("1", FilesPerLevel());
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v2", write_without_wal));
+ ASSERT_OK(dbfull()->SetDBOptions({{"avoid_flush_during_shutdown", "true"}}));
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ("", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetDelayedWriteRateOption) {
+ Options options;
+ options.create_if_missing = true;
+ options.delayed_write_rate = 2 * 1024U * 1024U;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(2 * 1024U * 1024U,
+ dbfull()->TEST_write_controler().max_delayed_write_rate());
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"delayed_write_rate", "20000"}}));
+ ASSERT_EQ(20000, dbfull()->TEST_write_controler().max_delayed_write_rate());
+}
+
+TEST_F(DBOptionsTest, MaxTotalWalSizeChange) {
+ Random rnd(1044);
+ const auto value_size = size_t(1024);
+ std::string value;
+ test::RandomString(&rnd, value_size, &value);
+
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ CreateColumnFamilies({"1", "2", "3"}, options);
+ ReopenWithColumnFamilies({"default", "1", "2", "3"}, options);
+
+ WriteOptions write_options;
+
+ const int key_count = 100;
+ for (int i = 0; i < key_count; ++i) {
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), Key(i), value));
+ }
+ }
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}}));
+
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ ASSERT_EQ("1", FilesPerLevel(static_cast<int>(cf)));
+ }
+}
+
+TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) {
+ Options options;
+ options.create_if_missing = true;
+ options.stats_dump_period_sec = 5;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+
+ for (int i = 0; i < 20; i++) {
+ unsigned int num = rand() % 5000 + 1;
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"stats_dump_period_sec", ToString(num)}}));
+ ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec);
+ }
+ Close();
+}
+
+TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) {
+ Options options;
+ options.create_if_missing = true;
+ options.stats_persist_period_sec = 5;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}}));
+ ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+ ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}}));
+ ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+}
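+
+// Sketch of the error handling checked above (assuming an open DB* "db"):
+// a value that fails to parse is rejected and the previous setting is kept.
+//
+//   Status s = db->SetDBOptions({{"stats_persist_period_sec", "abcde"}});
+//   assert(!s.ok());  // parse failure; the option keeps its old value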
+
+static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) {
+ dbfull->TEST_LockMutex();
+ JobContext job_context(0);
+ dbfull->FindObsoleteFiles(&job_context, false);
+ ASSERT_EQ(empty, job_context.full_scan_candidate_files.empty());
+ dbfull->TEST_UnlockMutex();
+ if (job_context.HaveSomethingToDelete()) {
+ // fulfill the contract of FindObsoleteFiles by calling PurgeObsoleteFiles
+ // afterwards; otherwise the test may hang on shutdown
+ dbfull->PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+}
+
+TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) {
+ SpecialEnv env(env_);
+ env.time_elapse_only_sleep_ = true;
+ Options options;
+ options.env = &env;
+ options.create_if_missing = true;
+ ASSERT_OK(TryReopen(options));
+
+ // Verify that the candidate files set is empty when no full scan is requested.
+ assert_candidate_files_empty(dbfull(), true);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "0"}}));
+
+ // After delete_obsolete_files_period_micros is updated to 0, the next call
+ // to FindObsoleteFiles should make a full scan
+ assert_candidate_files_empty(dbfull(), false);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "20"}}));
+
+ assert_candidate_files_empty(dbfull(), true);
+
+ env.addon_time_.store(20);
+ assert_candidate_files_empty(dbfull(), true);
+
+ env.addon_time_.store(21);
+ assert_candidate_files_empty(dbfull(), false);
+
+ Close();
+}
+
+TEST_F(DBOptionsTest, MaxOpenFilesChange) {
+ SpecialEnv env(env_);
+ Options options;
+ options.env = CurrentOptions().env;
+ options.max_open_files = -1;
+
+ Reopen(options);
+
+ Cache* tc = dbfull()->TEST_table_cache();
+
+ ASSERT_EQ(-1, dbfull()->GetDBOptions().max_open_files);
+ ASSERT_LT(2000, tc->GetCapacity());
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_open_files", "1024"}}));
+ ASSERT_EQ(1024, dbfull()->GetDBOptions().max_open_files);
+ // examine the table cache (actual size should be 1014)
+ ASSERT_GT(1500, tc->GetCapacity());
+ Close();
+}
+
+TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) {
+ Options options;
+ options.delayed_write_rate = 0;
+ Reopen(options);
+ ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+
+ options.rate_limiter.reset(NewGenericRateLimiter(31 * 1024 * 1024));
+ Reopen(options);
+ ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+}
+
+TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+
+ options.ttl = 0;
+ options.periodic_compaction_seconds = 0;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 0;
+ options.periodic_compaction_seconds = 100;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 100;
+ options.periodic_compaction_seconds = 0;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 100;
+ options.periodic_compaction_seconds = 500;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBOptionsTest, SanitizeTtlDefault) {
+ Options options;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.compaction_style = kCompactionStyleLevel;
+ options.ttl = 0;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.ttl = 0;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100 * 24 * 60 * 60;
+ Reopen(options);
+ ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.ttl = 200;
+ options.periodic_compaction_seconds = 300;
+ Reopen(options);
+ ASSERT_EQ(200, dbfull()->GetOptions().ttl);
+
+ options.ttl = 500;
+ options.periodic_compaction_seconds = 300;
+ Reopen(options);
+ ASSERT_EQ(300, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SetFIFOCompactionOptions) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.compaction_options_fifo.allow_compaction = false;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ // Test dynamically changing ttl.
+ env_->addon_time_.store(0);
+ options.ttl = 1 * 60 * 60; // 1 hour
+ ASSERT_OK(TryReopen(options));
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Add 61 seconds to the time.
+ env_->addon_time_.fetch_add(61);
+
+ // No files should be compacted as ttl is set to 1 hour.
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 3600);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set ttl to 1 minute. So all files should get deleted.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}}));
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Test dynamically changing compaction_options_fifo.max_table_files_size
+ env_->addon_time_.store(0);
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.ttl = 0;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // No files should be compacted as max_table_files_size is set to 500 KB.
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 500 << 10);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set max_table_files_size to 12 KB. So only 1 file should remain now.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=12288;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 12 << 10);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+ // Test dynamically changing compaction_options_fifo.allow_compaction
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.ttl = 0;
+ options.compaction_options_fifo.allow_compaction = false;
+ options.level0_file_num_compaction_trigger = 6;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // No files should be compacted as max_table_files_size is set to 500 KB and
+ // allow_compaction is false
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set allow_compaction to true, so the number of files should be between 1 and 5.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GE(NumTableFilesAtLevel(0), 1);
+ ASSERT_LE(NumTableFilesAtLevel(0), 5);
+}
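+
+// The nested option-string syntax used above, shown standalone (assuming an
+// open DB* "db"); fields inside the braces are separated by ';':
+//
+//   Status s = db->SetOptions(
+//       {{"compaction_options_fifo",
+//         "{max_table_files_size=1073741824;allow_compaction=true;}"}});
+//   assert(s.ok());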
+
+TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) {
+ SpecialEnv env(env_);
+ Options options;
+ options.env = &env;
+
+ options.compaction_readahead_size = 0;
+ options.new_table_reader_for_compaction_inputs = true;
+ options.level0_file_num_compaction_trigger = 2;
+ const std::string kValue(1024, 'v');
+ Reopen(options);
+
+ ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size);
+ ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}}));
+ ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size);
+ for (int i = 0; i < 1024; i++) {
+ Put(Key(i), kValue);
+ }
+ Flush();
+ for (int i = 0; i < 1024 * 2; i++) {
+ Put(Key(i), kValue);
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(256, env_->compaction_readahead_size_);
+ Close();
+}
+
+TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.create_if_missing = true;
+
+ ASSERT_OK(TryReopen(options));
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // In release 6.0, ttl was promoted from a secondary level option under
+ // compaction_options_fifo to a top level option under ColumnFamilyOptions.
+ // We still need to handle old SetOptions calls but should ignore
+ // ttl under compaction_options_fifo.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"},
+ {"ttl", "60"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+
+ // Put ttl as the first option inside compaction_options_fifo. That works as
+ // it doesn't overwrite any other option.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"},
+ {"ttl", "191"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 191);
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_properties_test.cc b/src/rocksdb/db/db_properties_test.cc
new file mode 100644
index 000000000..50dc3efef
--- /dev/null
+++ b/src/rocksdb/db/db_properties_test.cc
@@ -0,0 +1,1711 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/perf_level.h"
+#include "rocksdb/table.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBPropertiesTest : public DBTestBase {
+ public:
+ DBPropertiesTest() : DBTestBase("/db_properties_test") {}
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, Empty) {
+ do {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.allow_concurrent_memtable_write = false;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::string num;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("1", num);
+
+ // Block sync calls
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("2", num);
+
+ Put(1, "k2", std::string(100000, 'y')); // Trigger compaction
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("1", num);
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ // Release sync calls
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->EnableFileDeletions(false));
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->EnableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("1", num);
+ } while (ChangeOptions());
+}
+
+TEST_F(DBPropertiesTest, CurrentVersionNumber) {
+ uint64_t v1, v2, v3;
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1));
+ Put("12345678", "");
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2));
+ Flush();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3));
+
+ ASSERT_EQ(v1, v2);
+ ASSERT_GT(v3, v2);
+}
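+
+// Sketch of the integer-property variant used above (assuming an open DB*
+// "db" and the default column family):
+//
+//   uint64_t v = 0;
+//   bool found =
+//       db->GetIntProperty("rocksdb.current-super-version-number", &v);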
+
+TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) {
+ const int kKeySize = 100;
+ const int kValueSize = 500;
+ const int kKeyNum = 100;
+
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10;
+ // Make them never flush
+ options.min_write_buffer_number_to_merge = 1000;
+ options.max_write_buffer_number = 1000;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"one", "two", "three", "four"}, options);
+
+ Random rnd(301);
+ for (auto* handle : handles_) {
+ for (int i = 0; i < kKeyNum; ++i) {
+ db_->Put(WriteOptions(), handle, RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ }
+
+ uint64_t manual_sum = 0;
+ uint64_t api_sum = 0;
+ uint64_t value = 0;
+ for (auto* handle : handles_) {
+ ASSERT_TRUE(
+ db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value));
+ manual_sum += value;
+ }
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
+ &api_sum));
+ ASSERT_GT(manual_sum, 0);
+ ASSERT_EQ(manual_sum, api_sum);
+
+ ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value));
+
+ uint64_t before_flush_trm;
+ uint64_t after_flush_trm;
+ for (auto* handle : handles_) {
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(
+ DB::Properties::kEstimateTableReadersMem, &before_flush_trm));
+
+ // Issue flush and expect larger memory usage of table readers.
+ db_->Flush(FlushOptions(), handle);
+
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(
+ DB::Properties::kEstimateTableReadersMem, &after_flush_trm));
+ ASSERT_GT(after_flush_trm, before_flush_trm);
+ }
+}
+
+namespace {
+void ResetTableProperties(TableProperties* tp) {
+ tp->data_size = 0;
+ tp->index_size = 0;
+ tp->filter_size = 0;
+ tp->raw_key_size = 0;
+ tp->raw_value_size = 0;
+ tp->num_data_blocks = 0;
+ tp->num_entries = 0;
+ tp->num_deletions = 0;
+ tp->num_merge_operands = 0;
+ tp->num_range_deletions = 0;
+}
+
+void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
+ double dummy_double;
+ std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
+ std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
+ ResetTableProperties(tp);
+ sscanf(tp_string.c_str(),
+ "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64
+ " # merge operands %" SCNu64 " # range deletions %" SCNu64
+ " raw key size %" SCNu64
+ " raw average key size %lf "
+ " raw value size %" SCNu64
+ " raw average value size %lf "
+ " data block size %" SCNu64 " index block size (user-key? %" SCNu64
+ ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
+ &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
+ &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
+ &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+ &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
+ &tp->index_size, &tp->filter_size);
+}
+
+void VerifySimilar(uint64_t a, uint64_t b, double bias) {
+ ASSERT_EQ(a == 0U, b == 0U);
+ if (a == 0) {
+ return;
+ }
+ double dbl_a = static_cast<double>(a);
+ double dbl_b = static_cast<double>(b);
+ if (dbl_a > dbl_b) {
+ ASSERT_LT(static_cast<double>(dbl_a - dbl_b) / (dbl_a + dbl_b), bias);
+ } else {
+ ASSERT_LT(static_cast<double>(dbl_b - dbl_a) / (dbl_a + dbl_b), bias);
+ }
+}
+
+void VerifyTableProperties(
+ const TableProperties& base_tp, const TableProperties& new_tp,
+ double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.15 : 0.1,
+ double index_size_bias = 0.1, double data_size_bias = 0.1,
+ double num_data_blocks_bias = 0.05) {
+ VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias);
+ VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias);
+ VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias);
+ VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks,
+ num_data_blocks_bias);
+
+ ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size);
+ ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size);
+ ASSERT_EQ(base_tp.num_entries, new_tp.num_entries);
+ ASSERT_EQ(base_tp.num_deletions, new_tp.num_deletions);
+ ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
+
+ // Merge operands may become Puts, so we only have an upper bound on the exact
+ // number of merge operands.
+ ASSERT_GE(base_tp.num_merge_operands, new_tp.num_merge_operands);
+}
+
+void GetExpectedTableProperties(
+ TableProperties* expected_tp, const int kKeySize, const int kValueSize,
+ const int kPutsPerTable, const int kDeletionsPerTable,
+ const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable,
+ const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize,
+ const bool index_key_is_user_key, const bool value_delta_encoding) {
+ const int kKeysPerTable =
+ kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable;
+ const int kPutCount = kTableCount * kPutsPerTable;
+ const int kDeletionCount = kTableCount * kDeletionsPerTable;
+ const int kMergeCount = kTableCount * kMergeOperandsPerTable;
+ const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
+ const int kKeyCount = kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount;
+ const int kAvgSuccessorSize = kKeySize / 5;
+ const int kEncodingSavePerKey = kKeySize / 4;
+ expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
+ expected_tp->raw_value_size =
+ (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize;
+ expected_tp->num_entries = kKeyCount;
+ expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount;
+ expected_tp->num_merge_operands = kMergeCount;
+ expected_tp->num_range_deletions = kRangeDeletionCount;
+ expected_tp->num_data_blocks =
+ kTableCount * (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
+ kBlockSize;
+ expected_tp->data_size =
+ kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
+ expected_tp->index_size =
+ expected_tp->num_data_blocks *
+ (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) -
+ // discount 1 byte as value size is not encoded in value delta encoding
+ (value_delta_encoding ? 1 : 0));
+ expected_tp->filter_size =
+ kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 +
+ /*average-ish overhead*/ CACHE_LINE_SIZE / 2);
+}
+} // anonymous namespace
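+
+// A small sketch (not part of the tests) of how the aggregated table
+// properties string parsed by the helpers above is obtained from a DB:
+//
+//   std::string prop;
+//   db->GetProperty(DB::Properties::kAggregatedTableProperties, &prop);
+//   // "prop" is a human-readable "name value; name value; ..." string that
+//   // ParseTablePropertiesString() above turns back into a TableProperties.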
+
+TEST_F(DBPropertiesTest, ValidatePropertyInfo) {
+ for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) {
+ // If C++ gets a std::string_literal, this would be better to check at
+ // compile-time using static_assert.
+ ASSERT_TRUE(ppt_name_and_info.first.empty() ||
+ !isdigit(ppt_name_and_info.first.back()));
+
+ int count = 0;
+ count += (ppt_name_and_info.second.handle_string == nullptr) ? 0 : 1;
+ count += (ppt_name_and_info.second.handle_int == nullptr) ? 0 : 1;
+ count += (ppt_name_and_info.second.handle_string_dbimpl == nullptr) ? 0 : 1;
+ ASSERT_TRUE(count == 1);
+ }
+}
+
+TEST_F(DBPropertiesTest, ValidateSampleNumber) {
+ // When "max_open_files" is -1, we read all the files for
+ // "rocksdb.estimate-num-keys" computation, which is the ground truth.
+ // Otherwise, we sample the 20 newest files to make an estimate.
+ // Formula: latest_20_files_active_key_ratio * total_files
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_stop_writes_trigger = 1000;
+ DestroyAndReopen(options);
+ int key = 0;
+ for (int files = 20; files >= 10; files -= 10) {
+ for (int i = 0; i < files; i++) {
+ int rows = files / 10;
+ for (int j = 0; j < rows; j++) {
+ db_->Put(WriteOptions(), std::to_string(++key), "foo");
+ }
+ db_->Flush(FlushOptions());
+ }
+ }
+ std::string num;
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ("45", num);
+ options.max_open_files = -1;
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ("50", num);
+}
+
+TEST_F(DBPropertiesTest, AggregatedTableProperties) {
+ for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
+ const int kDeletionsPerTable = 5;
+ const int kMergeOperandsPerTable = 15;
+ const int kRangeDeletionsPerTable = 5;
+ const int kPutsPerTable = 100;
+ const int kKeySize = 80;
+ const int kValueSize = 200;
+ const int kBloomBitsPerKey = 20;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.preserve_deletes = true;
+ options.merge_operator.reset(new TestPutOperator());
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(kBloomBitsPerKey, false));
+ table_options.block_size = 1024;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Hold open a snapshot to prevent range tombstones from being compacted
+ // away.
+ ManagedSnapshot snapshot(db_);
+
+ Random rnd(5632);
+ for (int table = 1; table <= kTableCount; ++table) {
+ for (int i = 0; i < kPutsPerTable; ++i) {
+ db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ for (int i = 0; i < kDeletionsPerTable; i++) {
+ db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize));
+ }
+ for (int i = 0; i < kMergeOperandsPerTable; i++) {
+ db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+ std::string start = RandomString(&rnd, kKeySize);
+ std::string end = start;
+ end.resize(kValueSize);
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
+ }
+ db_->Flush(FlushOptions());
+ }
+ std::string property;
+ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+ TableProperties output_tp;
+ ParseTablePropertiesString(property, &output_tp);
+ bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
+ bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;
+
+ TableProperties expected_tp;
+ GetExpectedTableProperties(
+ &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+ kMergeOperandsPerTable, kRangeDeletionsPerTable, kTableCount,
+ kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+ value_is_delta_encoded);
+
+ VerifyTableProperties(expected_tp, output_tp);
+ }
+}
+
+TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10;
+ options.level0_file_num_compaction_trigger = 6;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 4500 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.max_open_files = 11; // Make sure there is no preloading of table readers
+
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 11;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ int key_index = 0;
+ Random rnd(301);
+ for (int num = 0; num < 8; num++) {
+ Put("foo", "bar");
+ GenerateNewFile(&rnd, &key_index);
+ dbfull()->TEST_WaitForCompact();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+
+ // Get() after flushes; see that the latency histogram is tracked.
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Reopen and issue Get(). See the latency tracked.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ dbfull()->TEST_WaitForCompact();
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+
+ // Test for getting immutable_db_options_.statistics
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.options-statistics", &prop));
+ ASSERT_NE(std::string::npos, prop.find("rocksdb.block.cache.miss"));
+ ASSERT_EQ(std::string::npos, prop.find("rocksdb.db.f.micros"));
+
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Reopen and iterate. See the latency tracked.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+ }
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // CF 1 should show no histogram.
+ ASSERT_TRUE(
+ dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ // Put something and read it back; CF 1 should show a histogram.
+ Put(1, "foo", "bar");
+ Flush(1);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ASSERT_TRUE(
+ dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Setting max_open_files to -1 preloads all table readers.
+ options.max_open_files = -1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Clear internal stats
+ dbfull()->ResetStats();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+}
+
+TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
+ const int kTableCount = 100;
+ const int kDeletionsPerTable = 2;
+ const int kMergeOperandsPerTable = 2;
+ const int kRangeDeletionsPerTable = 2;
+ const int kPutsPerTable = 10;
+ const int kKeySize = 50;
+ const int kValueSize = 400;
+ const int kMaxLevel = 7;
+ const int kBloomBitsPerKey = 20;
+ Random rnd(301);
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 8192;
+ options.max_bytes_for_level_base = 10000;
+ options.max_bytes_for_level_multiplier = 2;
+ // This ensures no compaction is happening when we call GetProperty().
+ options.disable_auto_compactions = true;
+ options.preserve_deletes = true;
+ options.merge_operator.reset(new TestPutOperator());
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(kBloomBitsPerKey, false));
+ table_options.block_size = 1024;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Hold open a snapshot to prevent range tombstones from being compacted away.
+ ManagedSnapshot snapshot(db_);
+
+ std::string level_tp_strings[kMaxLevel];
+ std::string tp_string;
+ TableProperties level_tps[kMaxLevel];
+ TableProperties tp, sum_tp, expected_tp;
+ for (int table = 1; table <= kTableCount; ++table) {
+ for (int i = 0; i < kPutsPerTable; ++i) {
+ db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ for (int i = 0; i < kDeletionsPerTable; i++) {
+ db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize));
+ }
+ for (int i = 0; i < kMergeOperandsPerTable; i++) {
+ db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize),
+ RandomString(&rnd, kValueSize));
+ }
+ for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+ std::string start = RandomString(&rnd, kKeySize);
+ std::string end = start;
+ end.resize(kValueSize);
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
+ }
+ db_->Flush(FlushOptions());
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ResetTableProperties(&sum_tp);
+ for (int level = 0; level < kMaxLevel; ++level) {
+ db_->GetProperty(
+ DB::Properties::kAggregatedTablePropertiesAtLevel + ToString(level),
+ &level_tp_strings[level]);
+ ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]);
+ sum_tp.data_size += level_tps[level].data_size;
+ sum_tp.index_size += level_tps[level].index_size;
+ sum_tp.filter_size += level_tps[level].filter_size;
+ sum_tp.raw_key_size += level_tps[level].raw_key_size;
+ sum_tp.raw_value_size += level_tps[level].raw_value_size;
+ sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
+ sum_tp.num_entries += level_tps[level].num_entries;
+ sum_tp.num_deletions += level_tps[level].num_deletions;
+ sum_tp.num_merge_operands += level_tps[level].num_merge_operands;
+ sum_tp.num_range_deletions += level_tps[level].num_range_deletions;
+ }
+ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
+ ParseTablePropertiesString(tp_string, &tp);
+ bool index_key_is_user_key = tp.index_key_is_user_key > 0;
+ bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0;
+ ASSERT_EQ(sum_tp.data_size, tp.data_size);
+ ASSERT_EQ(sum_tp.index_size, tp.index_size);
+ ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
+ ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size);
+ ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
+ ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
+ ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
+ ASSERT_EQ(sum_tp.num_deletions, tp.num_deletions);
+ ASSERT_EQ(sum_tp.num_merge_operands, tp.num_merge_operands);
+ ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
+ if (table > 3) {
+ GetExpectedTableProperties(
+ &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+ kMergeOperandsPerTable, kRangeDeletionsPerTable, table,
+ kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+ value_is_delta_encoded);
+ // Gives larger bias here as index block size, filter block size,
+ // and data block size become much harder to estimate in this test.
+ VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25);
+ }
+ }
+}
+
+TEST_F(DBPropertiesTest, NumImmutableMemTable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.write_buffer_size = 1000000;
+ options.max_write_buffer_size_to_maintain =
+ 5 * static_cast<int64_t>(options.write_buffer_size);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::string big_value(1000000 * 2, 'x');
+ std::string num;
+ uint64_t value;
+ SetPerfLevel(kEnableTime);
+ ASSERT_TRUE(GetPerfLevel() == kEnableTime);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+ ASSERT_EQ(num, "1");
+
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k2");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "2");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+ ASSERT_EQ(num, "2");
+ get_perf_context()->Reset();
+ Get(1, "k2");
+ ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k3");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(3, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(Flush(1));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+ ASSERT_EQ(num, "3");
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.cur-size-active-mem-table", &value));
+ // "192" is the size of the metadata of two empty skiplists, this would
+ // break if we change the default skiplist implementation
+ ASSERT_GE(value, 192);
+
+ uint64_t int_num;
+ uint64_t base_total_size;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
+
+ ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
+ ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
+ ASSERT_EQ(int_num, 2U);
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
+ ASSERT_EQ(int_num, 3U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
+ ASSERT_EQ(int_num, 4U);
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.estimate-num-keys", &int_num));
+ ASSERT_EQ(int_num, base_total_size + 1);
+
+ SetPerfLevel(kDisable);
+ ASSERT_TRUE(GetPerfLevel() == kDisable);
+ } while (ChangeCompactOptions());
+}
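+
+// Minimal sketch of the perf-context pattern used in the test above (assuming
+// an open DB* "db"); the perf level must be raised (the test uses kEnableTime)
+// for get_from_memtable_count to be populated:
+//
+//   SetPerfLevel(kEnableTime);
+//   get_perf_context()->Reset();
+//   std::string value;
+//   db->Get(ReadOptions(), "k1", &value);
+//   uint64_t memtable_gets = get_perf_context()->get_from_memtable_count;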
+
+// TODO(techdept) : Disabled flaky test #12863555
+TEST_F(DBPropertiesTest, DISABLED_GetProperty) {
+ // Set sizes to both background thread pool to be 1 and block them.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = 1;
+ options.compaction_options_universal.size_ratio = 50;
+ options.max_background_compactions = 1;
+ options.max_background_flushes = 1;
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.write_buffer_size = 1000000;
+ Reopen(options);
+
+ std::string big_value(1000000 * 2, 'x');
+ std::string num;
+ uint64_t int_num;
+ SetPerfLevel(kEnableTime);
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "1");
+ get_perf_context()->Reset();
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
+ ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "2");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "2");
+ // Verify the same set of properties through GetIntProperty
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
+ ASSERT_EQ(int_num, 2U);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
+ ASSERT_EQ(int_num, 1U);
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
+ ASSERT_EQ(int_num, 0U);
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
+ ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "4");
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Wait for compaction to be done. This is important because otherwise RocksDB
+ // might schedule a compaction when reopening the database, failing assertion
+ // (A) as a result.
+ dbfull()->TEST_WaitForCompact();
+ options.max_open_files = 10;
+ Reopen(options);
+ // After reopening, no table reader is loaded, so no memory for table readers
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U); // (A)
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ // After reading a key, at least one table reader is loaded.
+ Get("k5");
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ // Test rocksdb.num-live-versions
+ {
+ options.level0_file_num_compaction_trigger = 20;
+ Reopen(options);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 1U);
+
+ // Use an iterator to hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
+ Flush();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ // Use an iterator to hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
+ Flush();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 3U);
+
+ iter2.reset();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ iter1.reset();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 1U);
+ }
+}
+
+TEST_F(DBPropertiesTest, ApproximateMemoryUsage) {
+ const int kNumRounds = 10;
+ // TODO(noetzli) kFlushesPerRound does not really correlate with how many
+ // flushes happen.
+ const int kFlushesPerRound = 10;
+ const int kWritesPerFlush = 10;
+ const int kKeySize = 100;
+ const int kValueSize = 1000;
+ Options options;
+ options.write_buffer_size = 1000; // small write buffer
+ options.min_write_buffer_number_to_merge = 4;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ std::vector<Iterator*> iters;
+
+ uint64_t active_mem;
+ uint64_t unflushed_mem;
+ uint64_t all_mem;
+ uint64_t prev_all_mem;
+
+ // Phase 0. Verify that the initial values of all these properties are the
+ // same, as we have no mem-tables.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(all_mem, active_mem);
+ ASSERT_EQ(all_mem, unflushed_mem);
+
+ // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" to equal
+ // "size-all-mem-tables".
+ for (int r = 0; r < kNumRounds; ++r) {
+ for (int f = 0; f < kFlushesPerRound; ++f) {
+ for (int w = 0; w < kWritesPerFlush; ++w) {
+ Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize));
+ }
+ }
+ // Make sure that there is no flush between getting the two properties.
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ // With no iterators, these two numbers should be the same.
+ ASSERT_EQ(unflushed_mem, all_mem);
+ }
+ prev_all_mem = all_mem;
+
+ // Phase 2. Keep issuing Put() but also create new iterators. This time we
+ // expect "size-all-mem-tables" > "cur-size-all-mem-tables".
+ for (int r = 0; r < kNumRounds; ++r) {
+ iters.push_back(db_->NewIterator(ReadOptions()));
+ for (int f = 0; f < kFlushesPerRound; ++f) {
+ for (int w = 0; w < kWritesPerFlush; ++w) {
+ Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize));
+ }
+ }
+ // Force a flush to prevent one from happening between getting the two
+ // properties, or after getting them and before the next round.
+ Flush();
+
+ // Iterators created in this phase pin flushed memtables, so expect
+ // "size-all-mem-tables" to exceed the other two counters.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_GT(all_mem, active_mem);
+ ASSERT_GT(all_mem, unflushed_mem);
+ ASSERT_GT(all_mem, prev_all_mem);
+ prev_all_mem = all_mem;
+ }
+
+ // Phase 3. Delete iterators and expect "size-all-mem-tables" to shrink
+ // whenever we release an iterator.
+ for (auto* iter : iters) {
+ delete iter;
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ // Expect the size to shrink
+ ASSERT_LT(all_mem, prev_all_mem);
+ prev_all_mem = all_mem;
+ }
+
+ // Phase 4. After all iterators are released, expect all three counters to be
+ // the same.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(active_mem, unflushed_mem);
+ ASSERT_EQ(unflushed_mem, all_mem);
+
+ // Phase 5. Reopen, and expect all three counters to be the same again.
+ Reopen(options);
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(active_mem, unflushed_mem);
+ ASSERT_EQ(unflushed_mem, all_mem);
+}
+
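+// A minimal usage sketch (illustrative only, not part of the upstream test
+// above): querying the three mem-table size properties the test exercises.
+// The test relies on the ordering cur-size-active-mem-table <=
+// cur-size-all-mem-tables <= size-all-mem-tables, since the last property
+// also counts flushed memtables still pinned by outstanding iterators.
+// `MemTableSizes` and `GetMemTableSizes` are hypothetical helper names.
+struct MemTableSizes {
+ uint64_t active = 0;     // mutable memtable only
+ uint64_t unflushed = 0;  // mutable + immutable (unflushed) memtables
+ uint64_t all = 0;        // unflushed + memtables pinned by iterators
+};
+
+inline MemTableSizes GetMemTableSizes(DB* db) {
+ MemTableSizes sizes;
+ db->GetIntProperty("rocksdb.cur-size-active-mem-table", &sizes.active);
+ db->GetIntProperty("rocksdb.cur-size-all-mem-tables", &sizes.unflushed);
+ db->GetIntProperty("rocksdb.size-all-mem-tables", &sizes.all);
+ return sizes;
+}
+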
+TEST_F(DBPropertiesTest, EstimatePendingCompBytes) {
+ // Set the size of each background thread pool to 1 and block the
+ // low-priority pool.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_background_compactions = 1;
+ options.max_background_flushes = 1;
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.write_buffer_size = 1000000;
+ Reopen(options);
+
+ std::string big_value(1000000 * 2, 'x');
+ std::string num;
+ uint64_t int_num;
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+ Flush();
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+ Flush();
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+ Flush();
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_EQ(int_num, 0U);
+}
+
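+// Illustrative sketch (not part of the upstream test above): an application
+// could poll the same property to decide whether to back off writes while
+// compaction debt is high. `PendingCompactionBytesAbove` and the soft limit
+// are hypothetical, not RocksDB APIs or options.
+inline bool PendingCompactionBytesAbove(DB* db, uint64_t soft_limit_bytes) {
+ uint64_t pending_bytes = 0;
+ // GetIntProperty() returns false if the property is unavailable for this
+ // DB configuration; treat that as "not above the limit".
+ if (!db->GetIntProperty("rocksdb.estimate-pending-compaction-bytes",
+ &pending_bytes)) {
+ return false;
+ }
+ return pending_bytes > soft_limit_bytes;
+}
+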
+TEST_F(DBPropertiesTest, EstimateCompressionRatio) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ const int kNumL0Files = 3;
+ const int kNumEntriesPerFile = 1000;
+
+ Options options = CurrentOptions();
+ options.compression_per_level = {kNoCompression, kSnappyCompression};
+ options.disable_auto_compactions = true;
+ options.num_levels = 2;
+ Reopen(options);
+
+ // The compression ratio is -1.0 when there are no files at the level.
+ ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+
+ const std::string kVal(100, 'a');
+ for (int i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ // Put the common data ("key") at the end to prevent delta encoding from
+ // compressing the keys effectively
+ std::string key = ToString(i) + ToString(j) + "key";
+ ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal));
+ }
+ Flush();
+ }
+
+ // no compression at L0, so ratio is less than one
+ ASSERT_LT(CompressionRatioAtLevel(0), 1.0);
+ ASSERT_GT(CompressionRatioAtLevel(0), 0.0);
+ ASSERT_EQ(CompressionRatioAtLevel(1), -1.0);
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+ ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+ // Data at L1 should be highly compressed thanks to Snappy and redundant data
+ // in values (ratio is 12.846 as of 4/19/2016).
+ ASSERT_GT(CompressionRatioAtLevel(1), 10.0);
+}
+
+#endif // ROCKSDB_LITE
+
+class CountingUserTblPropCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "CountingUserTblPropCollector"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{
+ {"CountingUserTblPropCollector", message_}, {"Count", encoded},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ ++count_;
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ std::string message_ = "Rocksdb";
+ uint32_t count_ = 0;
+};
+
+class CountingUserTblPropCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ explicit CountingUserTblPropCollectorFactory(
+ uint32_t expected_column_family_id)
+ : expected_column_family_id_(expected_column_family_id),
+ num_created_(0) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override {
+ EXPECT_EQ(expected_column_family_id_, context.column_family_id);
+ num_created_++;
+ return new CountingUserTblPropCollector();
+ }
+ const char* Name() const override {
+ return "CountingUserTblPropCollectorFactory";
+ }
+ void set_expected_column_family_id(uint32_t v) {
+ expected_column_family_id_ = v;
+ }
+ uint32_t expected_column_family_id_;
+ uint32_t num_created_;
+};
+
+class CountingDeleteTabPropCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "CountingDeleteTabPropCollector"; }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType type, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ if (type == kEntryDelete) {
+ num_deletes_++;
+ }
+ return Status::OK();
+ }
+
+ bool NeedCompact() const override { return num_deletes_ > 10; }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ *properties =
+ UserCollectedProperties{{"num_delete", ToString(num_deletes_)}};
+ return Status::OK();
+ }
+
+ private:
+ uint32_t num_deletes_ = 0;
+};
+
+class CountingDeleteTabPropCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new CountingDeleteTabPropCollector();
+ }
+ const char* Name() const override {
+ return "CountingDeleteTabPropCollectorFactory";
+ }
+};
+
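+// Illustrative sketch (not part of the upstream tests): how a collector
+// factory such as the ones above is wired into Options, mirroring what the
+// tests below do explicitly. `WithCollectorFactory` is a hypothetical helper.
+inline Options WithCollectorFactory(
+ Options options,
+ std::shared_ptr<TablePropertiesCollectorFactory> factory) {
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = factory;
+ return options;
+}
+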
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = (1 << 30);
+ options.table_properties_collector_factories.resize(1);
+ std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+ std::make_shared<CountingUserTblPropCollectorFactory>(0);
+ options.table_properties_collector_factories[0] = collector_factory;
+ Reopen(options);
+ // Create 4 tables
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
+ }
+ db_->Flush(FlushOptions());
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(4U, props.size());
+ uint32_t sum = 0;
+ for (const auto& item : props) {
+ auto& user_collected = item.second->user_collected_properties;
+ ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") !=
+ user_collected.end());
+ ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb");
+ ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+ Slice key(user_collected.at("Count"));
+ uint32_t count;
+ ASSERT_TRUE(GetVarint32(&key, &count));
+ sum += count;
+ }
+ ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+
+ ASSERT_GT(collector_factory->num_created_, 0U);
+ collector_factory->num_created_ = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+ ASSERT_GT(collector_factory->num_created_, 0U);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 3;
+ options.table_properties_collector_factories.resize(1);
+ std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+ std::make_shared<CountingUserTblPropCollectorFactory>(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Create 2 files
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(1, ToString(table * 100 + i), "val");
+ }
+ Flush(1);
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ // Trigger automatic compactions.
+ for (int table = 0; table < 3; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(1, ToString(table * 100 + i), "val");
+ }
+ Flush(1);
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ // Come back to write to default column family
+ collector_factory->num_created_ = 0;
+ collector_factory->set_expected_column_family_id(0); // default CF
+ // Create 2 tables in the default column family
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ // Trigger automatic compactions.
+ for (int table = 0; table < 3; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+ ASSERT_GT(collector_factory->num_created_, 0U);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) {
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.target_file_size_base = 2048;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.num_levels = 8;
+ options.env = env_;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+ std::make_shared<CountingDeleteTabPropCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ DestroyAndReopen(options);
+
+ const int kMaxKey = 1000;
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ if (NumTableFilesAtLevel(0) == 1) {
+ // Clear Level 0 so that when we later flush a file with deletions, we don't
+ // trigger an organic compaction.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(Put(Key(kMaxKey * 2), ""));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ {
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(kMaxKey - 100));
+ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+ iter->Next();
+ ++c;
+ }
+ ASSERT_EQ(c, 200);
+ }
+
+ Delete(Key(0));
+ for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) {
+ Delete(Key(i));
+ }
+ Delete(Key(kMaxKey * 2));
+
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+
+ {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(kMaxKey - 100));
+ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+ // Count keys that remain visible in the deleted range; expect none below.
+ iter->Next();
+ ++c;
+ }
+ ASSERT_EQ(c, 0);
+ ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u);
+ ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u);
+ SetPerfLevel(kDisable);
+ }
+}
+
+TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 10;
+ options.level0_slowdown_writes_trigger = 10;
+ options.level0_stop_writes_trigger = 10;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+ std::make_shared<CountingDeleteTabPropCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ DestroyAndReopen(options);
+
+ const int kMaxKey = 100;
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), ""));
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ for (int i = 1; i < kMaxKey - 1; i++) {
+ Delete(Key(i));
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+
+ // Restart the DB. Although the number of files didn't reach
+ // options.level0_file_num_compaction_trigger, compaction should
+ // still be triggered because of the need-compaction hint.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+ c++;
+ }
+ ASSERT_EQ(c, 2);
+ ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+ // We iterate every key twice. Is it a bug?
+ ASSERT_LE(get_perf_context()->internal_key_skipped_count, 2);
+ SetPerfLevel(kDisable);
+ }
+}
+
+TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
+ Options options;
+ Reopen(options);
+ Put("foo", "bar");
+ Delete("foo");
+ Delete("foo");
+ uint64_t num_keys = 0;
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys));
+ ASSERT_EQ(0, num_keys);
+}
+
+TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {
+ std::unique_ptr<MockTimeEnv> mock_env(new MockTimeEnv(Env::Default()));
+ uint64_t oldest_key_time = 0;
+ Options options;
+ options.env = mock_env.get();
+
+ // "rocksdb.estimate-oldest-key-time" only available to fifo compaction.
+ mock_env->set_current_time(100);
+ for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal,
+ kCompactionStyleNone}) {
+ options.compaction_style = compaction;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_FALSE(dbfull()->GetIntProperty(
+ DB::Properties::kEstimateOldestKeyTime, &oldest_key_time));
+ }
+
+ options.compaction_style = kCompactionStyleFIFO;
+ options.ttl = 300;
+ options.compaction_options_fifo.allow_compaction = false;
+ DestroyAndReopen(options);
+
+ mock_env->set_current_time(100);
+ ASSERT_OK(Put("k1", "v1"));
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time);
+ ASSERT_OK(Flush());
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time);
+
+ mock_env->set_current_time(200);
+ ASSERT_OK(Put("k2", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time);
+
+ mock_env->set_current_time(300);
+ ASSERT_OK(Put("k3", "v3"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time);
+
+ mock_env->set_current_time(450);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(200, oldest_key_time);
+
+ mock_env->set_current_time(550);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(300, oldest_key_time);
+
+ mock_env->set_current_time(650);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel());
+ ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+
+ // Close before mock_env destructs.
+ Close();
+}
+
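+// Illustrative sketch (not part of the upstream test above): deriving an
+// approximate age of the oldest data from the property exercised above. As
+// the test shows, the property is only populated for FIFO compaction, so a
+// false return simply means "unknown". `ApproximateOldestDataAge` is a
+// hypothetical helper name.
+inline bool ApproximateOldestDataAge(DB* db, Env* env, uint64_t* age_seconds) {
+ uint64_t oldest_key_time = 0;
+ if (!db->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time)) {
+ return false;
+ }
+ int64_t now = 0;
+ if (!env->GetCurrentTime(&now).ok()) {
+ return false;
+ }
+ *age_seconds = static_cast<uint64_t>(now) > oldest_key_time
+ ? static_cast<uint64_t>(now) - oldest_key_time
+ : 0;
+ return true;
+}
+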
+TEST_F(DBPropertiesTest, SstFilesSize) {
+ struct TestListener : public EventListener {
+ void OnCompactionCompleted(DB* db,
+ const CompactionJobInfo& /*info*/) override {
+ assert(callback_triggered == false);
+ assert(size_before_compaction > 0);
+ callback_triggered = true;
+ uint64_t total_sst_size = 0;
+ uint64_t live_sst_size = 0;
+ bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+ &total_sst_size);
+ ASSERT_TRUE(ok);
+ // total_sst_size includes files both before and after compaction.
+ ASSERT_GT(total_sst_size, size_before_compaction);
+ ok =
+ db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+ ASSERT_TRUE(ok);
+ // live_sst_size only includes files after compaction.
+ ASSERT_GT(live_sst_size, 0);
+ ASSERT_LT(live_sst_size, size_before_compaction);
+ }
+
+ uint64_t size_before_compaction = 0;
+ bool callback_triggered = false;
+ };
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ Options options;
+ options.disable_auto_compactions = true;
+ options.listeners.push_back(listener);
+ Reopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("key" + ToString(i), std::string(1000, 'v')));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Delete("key" + ToString(i)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t sst_size;
+ bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size);
+ ASSERT_TRUE(ok);
+ ASSERT_GT(sst_size, 0);
+ listener->size_before_compaction = sst_size;
+ // Compact to clean all keys and trigger listener.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_TRUE(listener->callback_triggered);
+}
+
+TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
+ class TestListener : public EventListener {
+ public:
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ if (info.reason == TableFileCreationReason::kCompaction) {
+ // Verify the property indicates that SSTs created by a running
+ // compaction cannot be deleted.
+ uint64_t created_file_num;
+ FileType created_file_type;
+ std::string filename =
+ info.file_path.substr(info.file_path.rfind('/') + 1);
+ ASSERT_TRUE(
+ ParseFileName(filename, &created_file_num, &created_file_type));
+ ASSERT_EQ(kTableFile, created_file_type);
+
+ uint64_t keep_sst_lower_bound;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kMinObsoleteSstNumberToKeep,
+ &keep_sst_lower_bound));
+
+ ASSERT_LE(keep_sst_lower_bound, created_file_num);
+ validated_ = true;
+ }
+ }
+
+ void SetDB(DB* db) { db_ = db; }
+
+ int GetNumCompactions() { return num_compactions_; }
+
+ // True if we've verified the property for at least one output file
+ bool Validated() { return validated_; }
+
+ private:
+ int num_compactions_ = 0;
+ bool validated_ = false;
+ DB* db_ = nullptr;
+ };
+
+ const int kNumL0Files = 4;
+
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ Options options = CurrentOptions();
+ options.listeners.push_back(listener);
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ DestroyAndReopen(options);
+ listener->SetDB(db_);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure they overlap in keyspace to prevent trivial move
+ Put("key1", "val");
+ Put("key2", "val");
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(listener->Validated());
+}
+
+TEST_F(DBPropertiesTest, BlockCacheProperties) {
+ Options options;
+ uint64_t value;
+
+ // Block cache properties are not available for table formats other than
+ // block-based table.
+ options.table_factory.reset(NewPlainTableFactory());
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ options.table_factory.reset(NewCuckooTableFactory());
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Block cache properties are not available if block cache is not used.
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Test with empty block cache.
+ constexpr size_t kCapacity = 100;
+ LRUCacheOptions co;
+ co.capacity = kCapacity;
+ co.num_shard_bits = 0;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto block_cache = NewLRUCache(co);
+ table_options.block_cache = block_cache;
+ table_options.no_block_cache = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(0, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert unpinned item to the cache and check size.
+ constexpr size_t kSize1 = 50;
+ block_cache->Insert("item1", nullptr /*value*/, kSize1, nullptr /*deleter*/);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert pinned item to the cache and check size.
+ constexpr size_t kSize2 = 30;
+ Cache::Handle* item2 = nullptr;
+ block_cache->Insert("item2", nullptr /*value*/, kSize2, nullptr /*deleter*/,
+ &item2);
+ ASSERT_NE(nullptr, item2);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1 + kSize2, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2, value);
+
+ // Insert another pinned item to make the cache over-sized.
+ constexpr size_t kSize3 = 80;
+ Cache::Handle* item3 = nullptr;
+ block_cache->Insert("item3", nullptr /*value*/, kSize3, nullptr /*deleter*/,
+ &item3);
+ ASSERT_NE(nullptr, item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // Item 1 is evicted.
+ ASSERT_EQ(kSize2 + kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+
+ // Check size after release.
+ block_cache->Release(item2);
+ block_cache->Release(item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // item2 will be evicted, while item3 remains in the cache after release.
+ ASSERT_EQ(kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+}
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_range_del_test.cc b/src/rocksdb/db/db_range_del_test.cc
new file mode 100644
index 000000000..15225875d
--- /dev/null
+++ b/src/rocksdb/db/db_range_del_test.cc
@@ -0,0 +1,1660 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBRangeDelTest : public DBTestBase {
+ public:
+ DBRangeDelTest() : DBTestBase("/db_range_del_test") {}
+
+ std::string GetNumericStr(int key) {
+ uint64_t uint64_key = static_cast<uint64_t>(key);
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
+ return str;
+ }
+};
+
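+// Illustrative sketch (not part of the upstream tests below): the basic
+// DeleteRange call these tests exercise. A range tombstone covers keys in
+// [begin, end), so the end key itself is not deleted. `DeleteKeyRange` is a
+// hypothetical wrapper, shown only to make the API shape explicit.
+inline Status DeleteKeyRange(DB* db, const Slice& begin, const Slice& end) {
+ return db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), begin,
+ end);
+}
+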
+// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not
+// supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
+ // TODO: figure out why MmapReads trips the iterator pinning assertion in
+ // RangeDelAggregator. Ideally it would be supported; otherwise it should at
+ // least be explicitly unsupported.
+ for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
+ option_config_ = config;
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "dr1", "dr1")
+ .IsNotSupported());
+ }
+}
+
+TEST_F(DBRangeDelTest, WriteBatchWithIndexNotSupported) {
+ WriteBatchWithIndex indexedBatch{};
+ ASSERT_TRUE(indexedBatch.DeleteRange(db_->DefaultColumnFamily(), "dr1", "dr1")
+ .IsNotSupported());
+ ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported());
+}
+
+TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "dr1", "dr2"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
+ do {
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.statistics = CreateDBStatistics();
+ DestroyAndReopen(opts);
+
+ // The snapshot protects the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z");
+ db_->Flush(FlushOptions());
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+ // compactions as the above assertions about the number of files in a level
+ // do not hold true.
+ } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+ kSkipFIFOCompaction));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
+ // regression test for exactly filled compaction output files. Previously
+ // another file would be generated containing all range deletions, which
+ // could invalidate the non-overlapping file boundary invariant.
+ const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ options.num_levels = 2;
+ options.target_file_size_base = kFileBytes;
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = 50; // each block holds two keys
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ // The snapshot protects the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1));
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(RandomString(&rnd, 3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ if (j == 0 && i > 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ }
+ }
+ // put extra key to trigger final flush
+ ASSERT_OK(Put("", ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
+ // Ensures that when a range deletion spans multiple compaction output files
+ // cut by max_compaction_bytes, the output files still have non-overlapping
+ // key-ranges.
+ // https://github.com/facebook/rocksdb/issues/1778
+ const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.disable_auto_compactions = true;
+ opts.level0_file_num_compaction_trigger = kNumFiles;
+ opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ // Want max_compaction_bytes to trigger the end of compaction output file, not
+ // target_file_size_base, so make the latter much bigger
+ opts.target_file_size_base = 100 * opts.max_compaction_bytes;
+ Reopen(opts);
+
+ // The snapshot protects the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // It spans the whole key-range, thus will be included in all output files
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(0),
+ GetNumericStr(kNumFiles * kNumPerFile - 1)));
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 1MB (256 values, each 4K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(RandomString(&rnd, kBytesPerVal));
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
+ }
+ // extra entry to trigger SpecialSkipListFactory's flush
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GE(NumTableFilesAtLevel(1), 2);
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+ for (size_t i = 0; i < files[1].size() - 1; ++i) {
+ ASSERT_TRUE(InternalKeyComparator(opts.comparator)
+ .Compare(files[1][i].largest, files[1][i + 1].smallest) <
+ 0);
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
+ // Regression test for bug where sentinel range deletions (i.e., ones with
+ // sequence number of zero) were included in output files.
+ // The snapshot protects the range tombstones from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // Gaps between ranges create sentinels in our internal representation.
+ std::vector<std::pair<std::string, std::string>> range_dels = {
+ {"a", "b"}, {"c", "d"}, {"e", "f"}};
+ for (const auto& range_del : range_dels) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ range_del.first, range_del.second));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+ ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
+ db_->Put(WriteOptions(), "b1", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+ db_->Put(WriteOptions(), "b2", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ // first iteration verifies query correctness in memtable, second verifies
+ // query correctness for a single SST file
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ }
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+ ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
+ }
+}
+
+TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
+ db_->Put(WriteOptions(), "unused", "val"); // prevents empty after compaction
+ db_->Put(WriteOptions(), "b1", "val");
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ }
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
+ const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ Reopen(opts);
+
+ // Write a third before snapshot, a third between snapshot and tombstone, and
+ // a third after the tombstone. Keys older than snapshot or newer than the
+ // tombstone should be preserved.
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 3) {
+ snapshot = db_->GetSnapshot();
+ } else if (i == 2 * kNum / 3) {
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+ }
+ db_->Put(WriteOptions(), GetNumericStr(i), "val");
+ }
+ db_->Flush(FlushOptions());
+
+ for (int i = 0; i < kNum; ++i) {
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string value;
+ if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
+ ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
+ } else {
+ ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
+ const int kNumPerFile = 100, kNumFiles = 4;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.disable_auto_compactions = true;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ opts.num_levels = 2;
+ opts.statistics = CreateDBStatistics();
+ Reopen(opts);
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ if (i > 0) {
+ // range tombstone covers first half of the previous file
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr((i - 1) * kNumPerFile),
+ GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2));
+ }
+ // Make sure a given key appears in each file so compaction won't be able to
+ // use trivial move, which would happen if the ranges were non-overlapping.
+ // Also, we need an extra element since flush is only triggered when the
+ // number of keys is one greater than SpecialSkipListFactory's limit.
+ // We choose a key outside the key-range used by the test to avoid conflict.
+ db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val");
+
+ for (int j = 0; j < kNumPerFile; ++j) {
+ db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val");
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
+ TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string value;
+ if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
+ ASSERT_OK(
+ db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
+ } else {
+ ASSERT_TRUE(
+ db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
+ .IsNotFound());
+ }
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
+ const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.max_bytes_for_level_base = 2 * kFileBytes;
+ options.max_subcompactions = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ options.num_levels = 3;
+ options.target_file_size_base = kFileBytes;
+ options.target_file_size_multiplier = 1;
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < kNumFiles; ++j) {
+ if (i > 0) {
+ // delete [95,105) in two files, [295,305) in next two
+ int mid = (j + (1 - j % 2)) * kNumPerFile;
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(mid - 5), Key(mid + 5));
+ }
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int k = 0; k < kNumPerFile; k++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put("", ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ if (j < kNumFiles - 1) {
+ // background compaction may happen early for kNumFiles'th file
+ ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+ }
+ if (j == options.level0_file_num_compaction_trigger - 1) {
+ // When i == 1, compaction will output some files to L1, at which point
+ // L1 is not bottommost so range deletions cannot be compacted away. The
+ // new L1 files must be generated with non-overlapping key ranges even
+ // though multiple subcompactions see the same ranges deleted, else an
+ // assertion will fail.
+ //
+ // Only enable auto-compactions when we're ready; otherwise, the
+ // oversized L0 (relative to base_level) causes the compaction to run
+ // earlier.
+ ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"disable_auto_compactions", "true"}}));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+ }
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
+ const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
+ Options options = CurrentOptions();
+ options.compaction_options_universal.min_merge_width = kFilesPerLevel;
+ options.compaction_options_universal.max_merge_width = kFilesPerLevel;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kFilesPerLevel;
+ options.max_subcompactions = 4;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ options.num_levels = kNumLevels;
+ options.target_file_size_base = kNumPerFile << 10;
+ options.target_file_size_multiplier = 1;
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevels - 1; ++i) {
+ for (int j = 0; j < kFilesPerLevel; ++j) {
+ if (i == kNumLevels - 2) {
+ // insert range deletions [95,105) in two files, [295,305) in next two
+ // to prepare L1 for later manual compaction.
+ int mid = (j + (1 - j % 2)) * kNumPerFile;
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(mid - 5), Key(mid + 5));
+ }
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int k = 0; k < kNumPerFile; k++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put("", ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ if (j < kFilesPerLevel - 1) {
+ // background compaction may happen early for kFilesPerLevel'th file
+ ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+ }
+ // Now L1-L3 are full. When we compact L1->L2, we should see (1) subcompactions
+ // happen since the input level > 0, and (2) range deletions not dropped since
+ // the output level is not bottommost. If no file boundary assertion fails,
+ // universal compaction + subcompaction + range deletion are probably
+ // compatible.
+ ASSERT_OK(dbfull()->RunManualCompaction(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+ ->cfd(),
+ 1 /* input_level */, 2 /* output_level */, CompactRangeOptions(),
+ nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+ true /* disallow_trivial_move */,
+ port::kMaxUint64 /* max_file_num_to_ignore */));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
+ const int kNumPerFile = 3, kNumFiles = 3;
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(2 * kNumPerFile));
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ opts.num_levels = 2;
+ Reopen(opts);
+
+ // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
+ // requires an extra entry.
+ for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
+ if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
+ // Delete merge operands from all but the last file
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+ "key_");
+ }
+ std::string val;
+ PutFixed64(&val, i);
+ db_->Merge(WriteOptions(), "key", val);
+ // we need to prevent trivial move using Puts so compaction will actually
+ // process the merge operands.
+ db_->Put(WriteOptions(), "prevent_trivial_move", "");
+ if (i > 0 && i % kNumPerFile == 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ }
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 45); // 1+2+...+9
+ ASSERT_EQ(expected, actual);
+
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ expected.clear();
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ uint64_t tmp;
+ Slice tmp2(actual);
+ GetFixed64(&tmp2, &tmp);
+ PutFixed64(&expected, 30); // 6+7+8+9 (earlier operands covered by tombstone)
+ ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) {
+ // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4)
+ // Flush. The `CompactionIterator` previously had a bug where we forgot to
+ // check for covering range tombstones when processing the (1) Put, causing
+ // it to reappear after the flush.
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ Reopen(opts);
+
+ std::string val;
+ PutFixed64(&val, 1);
+ ASSERT_OK(db_->Put(WriteOptions(), "key", val));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 1);
+ ASSERT_EQ(expected, actual);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
+ // During compaction to bottommost level, verify range tombstones older than
+ // the oldest snapshot are removed, while others are preserved.
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.num_levels = 2;
+ opts.statistics = CreateDBStatistics();
+ Reopen(opts);
+
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+ "dr10"); // obsolete after compaction
+ db_->Put(WriteOptions(), "key", "val");
+ db_->Flush(FlushOptions());
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
+ "dr20"); // protected by snapshot
+ db_->Put(WriteOptions(), "key", "val");
+ db_->Flush(FlushOptions());
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
+ // The RangeDelAggregator holds pointers into range deletion blocks created by
+ // table readers. This test ensures the aggregator can still access those
+ // blocks even if it outlives the table readers that created them.
+ //
+ // DBIter always keeps readers open for L0 files. So, in order to test
+ // aggregator outliving reader, we need to have deletions in L1 files, which
+ // are opened/closed on-demand during the scan. This is accomplished by
+ // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
+ // from all lingering in L0 (there is at most one range deletion per L0 file).
+ //
+ // The first L1 file will contain a range deletion since its begin key is 0.
+ // SeekToFirst() references that table's reader and adds its range tombstone
+ // to the aggregator. Upon advancing beyond that table's key-range via Next(),
+ // the table reader will be unreferenced by the iterator. Since we manually
+ // call Evict() on all readers before the full scan, this unreference causes
+ // the reader's refcount to drop to zero and thus be destroyed.
+ //
+ // When it is destroyed, we do not remove its range deletions from the
+ // aggregator. So, subsequent calls to Next() must be able to use these
+ // deletions to decide whether a key is covered. This will work as long as
+ // the aggregator properly references the range deletion block.
+ const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.level0_file_num_compaction_trigger = 4;
+ opts.level0_stop_writes_trigger = 4;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+ opts.num_levels = 2;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.block_cache = NewLRUCache(8 << 20);
+ opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(opts);
+
+ // Hold a snapshot so range deletions can't become obsolete during compaction
+ // to bottommost level (i.e., L1).
+ const Snapshot* snapshot = db_->GetSnapshot();
+ for (int i = 0; i < kNum; ++i) {
+ db_->Put(WriteOptions(), GetNumericStr(i), "val");
+ if (i > 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+ }
+ }
+ // Must be > 1 so the first L1 file can be closed before scan finishes
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+ std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+
+ ReadOptions read_opts;
+ auto* iter = db_->NewIterator(read_opts);
+ int expected = kRangeEnd;
+ iter->SeekToFirst();
+ for (auto file_number : file_numbers) {
+ // This puts table caches in the state of being externally referenced only
+ // so they are destroyed immediately upon iterator unreferencing.
+ TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
+ }
+ for (; iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ ++expected;
+ // Keep clearing block cache's LRU so range deletion block can be freed as
+ // soon as its refcount drops to zero.
+ bbto.block_cache->EraseUnRefEntries();
+ }
+ ASSERT_EQ(kNum, expected);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ db_->Put(WriteOptions(), "key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
+ do {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 3;
+ opts.min_write_buffer_number_to_merge = 2;
+ // SpecialSkipListFactory lets us specify the maximum number of elements the
+ // memtable can hold. It switches the active memtable to immutable (flush is
+ // prevented by the above options) upon inserting an element that would
+ // overflow the memtable.
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+ DestroyAndReopen(opts);
+
+ db_->Put(WriteOptions(), "key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ db_->Put(WriteOptions(), "blah", "val");
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ db_->Put(WriteOptions(), "key", "val");
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ db_->ReleaseSnapshot(snapshot);
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
+ const int kNumMergeOps = 10;
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ Reopen(opts);
+
+ for (int i = 0; i < kNumMergeOps; ++i) {
+ std::string val;
+ PutFixed64(&val, i);
+ db_->Merge(WriteOptions(), "key", val);
+ if (i == kNumMergeOps / 2) {
+ // deletes [0, 5]
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+ "key_");
+ }
+ }
+
+ ReadOptions read_opts;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 30); // 6+7+8+9
+ ASSERT_EQ(expected, actual);
+
+ expected.clear();
+ read_opts.ignore_range_deletions = true;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 45); // 0+1+2+...+9
+ ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 4;
+ opts.min_write_buffer_number_to_merge = 3;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+ Reopen(opts);
+
+ db_->Put(WriteOptions(), "sst_key", "val");
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ db_->Put(WriteOptions(), "imm_key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ db_->Put(WriteOptions(), "mem_key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, key, &value));
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ Reopen(opts);
+
+ // Write half of the keys before the tombstone and half after the tombstone.
+ // Only covered keys (i.e., within the range and older than the tombstone)
+ // should be deleted.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+ }
+ db_->Put(WriteOptions(), GetNumericStr(i), "val");
+ }
+ ReadOptions read_opts;
+ auto* iter = db_->NewIterator(read_opts);
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ if (expected == kRangeBegin - 1) {
+ expected = kNum / 2;
+ } else {
+ ++expected;
+ }
+ }
+ ASSERT_EQ(kNum, expected);
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+ Reopen(opts);
+
+ const Snapshot* snapshot = nullptr;
+ // Put a snapshot before the range tombstone, verify an iterator using that
+ // snapshot sees all inserted keys.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+ }
+ db_->Put(WriteOptions(), GetNumericStr(i), "val");
+ }
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ auto* iter = db_->NewIterator(read_opts);
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ ++expected;
+ }
+ ASSERT_EQ(kNum / 2, expected);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
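+// Same layout as GetIgnoresRangeDeletions, but verifies that an iterator
+// created with ignore_range_deletions sees all three covered keys.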
+TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 4;
+ opts.min_write_buffer_number_to_merge = 3;
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+ Reopen(opts);
+
+ db_->Put(WriteOptions(), "sst_key", "val");
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ db_->Put(WriteOptions(), "imm_key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ db_->Put(WriteOptions(), "mem_key", "val");
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ auto* iter = db_->NewIterator(read_opts);
+ int i = 0;
+ std::string expected[] = {"imm_key", "mem_key", "sst_key"};
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
+ std::string key;
+ ASSERT_EQ(expected[i], iter->key());
+ }
+ ASSERT_EQ(3, i);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+#ifndef ROCKSDB_UBSAN_RUN
+TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
+ db_->Put(WriteOptions(), "key", "val");
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ // Each iteration checks that tailing iterators are unsupported while the
+ // range tombstone lives in the memtable, then L0, then L1.
+ for (int i = 0; i < 3; ++i) {
+ ReadOptions read_opts;
+ read_opts.tailing = true;
+ auto* iter = db_->NewIterator(read_opts);
+ if (i == 2) {
+ // For L1+, iterators over files are created on demand, so a seek is
+ // needed before the error status surfaces.
+ iter->SeekToFirst();
+ }
+ ASSERT_TRUE(iter->status().IsNotSupported());
+ delete iter;
+ if (i == 0) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else if (i == 1) {
+ MoveFilesToLevel(1);
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+#endif // !ROCKSDB_UBSAN_RUN
+
+TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) {
+ const int kNumFiles = 2, kNumKeysPerFile = 4;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.max_subcompactions = 2;
+ options.num_levels = 2;
+ options.target_file_size_base = 4096;
+ Reopen(options);
+
+ // Need an L1 file for a subcompaction to be triggered.
+ ASSERT_OK(
+ db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+
+ // put enough keys to fill up the first subcompaction, and later range-delete
+ // them so that the first subcompaction outputs no key-values. In that case
+ // it'll consider making an SST file dedicated to range deletions.
+ for (int i = 0; i < kNumKeysPerFile; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+ std::string(1024, 'a')));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(kNumKeysPerFile)));
+
+ // the above range tombstone can be dropped, so that one alone won't cause a
+ // dedicated file to be opened. We can make one protected by snapshot that
+ // must be considered. Make its range outside the first subcompaction's range
+ // to exercise the tricky part of the code.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(kNumKeysPerFile + 1),
+ Key(kNumKeysPerFile + 2)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ db_->EnableAutoCompaction({db_->DefaultColumnFamily()});
+ dbfull()->TEST_WaitForCompact();
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MemtableBloomFilter) {
+ // Regression test for #2743: range deletion tombstones in the memtable must
+ // still be applied even when Get() skips the point-key search due to a
+ // prefix bloom filter miss.
+ const int kMemtableSize = 1 << 20; // 1MB
+ const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
+ const int kNumKeys = 1000;
+ const int kPrefixLen = 8;
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio =
+ static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+ options.write_buffer_size = kMemtableSize;
+ Reopen(options);
+
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ Flush();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(kNumKeys)));
+ for (int i = 0; i < kNumKeys; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ }
+}
+
+TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
+ // This test originally verified that compaction treated files containing a
+ // split range deletion in the input level as an atomic unit. I.e.,
+ // compacting any input-level file(s) containing a portion of the range
+ // deletion causes all other input-level files containing portions of that
+ // same range deletion to be included in the compaction. Range deletion
+ // tombstones are now truncated to sstable boundaries which removed the need
+ // for that behavior (which could lead to excessively large
+ // compactions).
+ const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(2 /* num_entries_flush */));
+ options.target_file_size_base = kValueBytes;
+ // i == 0: CompactFiles
+ // i == 1: CompactRange
+ // i == 2: automatic compaction
+ for (int i = 0; i < 3; ++i) {
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // snapshot protects range tombstone from dropping due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(2 * kNumFilesPerLevel));
+
+ Random rnd(301);
+ std::string value = RandomString(&rnd, kValueBytes);
+ for (int j = 0; j < kNumFilesPerLevel; ++j) {
+ // give files overlapping key-ranges to prevent trivial move
+ ASSERT_OK(Put(Key(j), value));
+ ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+ if (j > 0) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(j, NumTableFilesAtLevel(0));
+ }
+ }
+ // put extra key to trigger final flush
+ ASSERT_OK(Put("", ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1));
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ if (i == 0) {
+ ASSERT_OK(db_->CompactFiles(
+ CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ } else if (i == 1) {
+ auto begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+ } else if (i == 2) {
+ ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"max_bytes_for_level_base", "10000"}}));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ }
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+ db_->ReleaseSnapshot(snapshot);
+ }
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
+ // Test the handling of the range-tombstone end-key as the
+ // upper-bound for an sstable.
+
+ const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(2 /* num_entries_flush */));
+ options.target_file_size_base = kValueBytes;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+
+ // Create an initial sstable at L2:
+ // [key000000#1,1, key000000#1,1]
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // A snapshot protects the range tombstone from dropping due to
+ // becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(2 * kNumFilesPerLevel));
+
+ // Create 2 additional sstables in L0. Note that the first sstable
+ // contains the range tombstone.
+ // [key000000#3,1, key000004#72057594037927935,15]
+ // [key000001#5,1, key000002#6,1]
+ Random rnd(301);
+ std::string value = RandomString(&rnd, kValueBytes);
+ for (int j = 0; j < kNumFilesPerLevel; ++j) {
+ // Give files overlapping key-ranges to prevent a trivial move when we
+ // compact from L0 to L1.
+ ASSERT_OK(Put(Key(j), value));
+ ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+ }
+ // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+ // are 2 sstables generated in L1 due to the target_file_size_base setting.
+ // L1:
+ // [key000000#3,1, key000002#72057594037927935,15]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ // L2:
+ // [key000000#1,1, key000000#1,1]
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ {
+ // Compact the second sstable in L1:
+ // L1:
+ // [key000000#3,1, key000002#72057594037927935,15]
+ // L2:
+ // [key000000#1,1, key000000#1,1]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ //
+ // At the same time, verify the compaction does not cause the key at the
+ // endpoint (key000002#6,1) to disappear.
+ ASSERT_EQ(value, Get(Key(2)));
+ auto begin_str = Key(3);
+ const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+ dbfull()->TEST_CompactRange(1, &begin, nullptr);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+ ASSERT_EQ(value, Get(Key(2)));
+ }
+
+ {
+ // Compact the first sstable in L1. This should be copacetic, but
+ // previously resulted in overlapping sstables in L2 due to
+ // mishandling of the range tombstone end-key when used as the
+ // largest key for an sstable. The resulting LSM structure should
+ // be:
+ //
+ // L2:
+ // [key000000#1,1, key000001#72057594037927935,15]
+ // [key000001#5,1, key000002#72057594037927935,15]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ auto begin_str = Key(0);
+ const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+ dbfull()->TEST_CompactRange(1, &begin, &begin);
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+ }
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UnorderedTombstones) {
+ // Regression test for #2752. Range delete tombstones between
+ // different snapshot stripes are not stored in order, so the first
+ // tombstone of each snapshot stripe should be checked as a smallest
+ // candidate.
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ auto cf = db_->DefaultColumnFamily();
+
+ ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cf));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
+ // Hold a snapshot to separate these two delete ranges.
+ auto snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cf));
+ db_->ReleaseSnapshot(snapshot);
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(cf, &files);
+ ASSERT_EQ(1, files[0].size());
+ ASSERT_EQ("a", files[0][0].smallest.user_key());
+ ASSERT_EQ("c", files[0][0].largest.user_key());
+
+ std::string v;
+ auto s = db_->Get(ReadOptions(), "a", &v);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+class MockMergeOperator : public MergeOperator {
+ // Mock non-associative operator. Non-associativity is expressed by lack of
+ // implementation for any `PartialMerge*` functions.
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ assert(merge_out != nullptr);
+ merge_out->new_value = merge_in.operand_list.back().ToString();
+ return true;
+ }
+
+ const char* Name() const override { return "MockMergeOperator"; }
+};
+
+TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
+ // This test uses a non-associative merge operator since that is a convenient
+ // way to get compaction to write out files with overlapping user-keys at the
+ // endpoints. Note, however, overlapping endpoints can also occur with other
+ // value types (Put, etc.), assuming the right snapshots are present.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ // Push dummy data to L3 so that our actual test files on L0-L2 are not on
+ // the bottommost level; otherwise compaction may prevent us from creating
+ // overlapping user keys, since MergeHelper can combine all operands on the
+ // bottommost level.
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(3);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = RandomString(&rnd, kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
+ }
+ if (i == kNumFiles - 1) {
+ // Take snapshot to prevent covered merge operands from being dropped by
+ // compaction.
+ snapshot = db_->GetSnapshot();
+ // The DeleteRange is the last write so all merge operands are covered.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
+ nullptr /* end */, nullptr /* column_family */,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ // Now we have multiple files at L1 all containing a single user key, thus
+ // guaranteeing overlap in the file endpoints.
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ // Verify no merge operands reappeared after the compaction.
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ // Compact and verify again. It's worthwhile because now the files have
+ // tighter endpoints, so we can verify that doesn't mess anything up.
+ dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */,
+ nullptr /* end */, nullptr /* column_family */,
+ true /* disallow_trivial_move */);
+ ASSERT_GT(NumTableFilesAtLevel(2), 1);
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
+ // Verify a key newer than a range tombstone cannot be deleted by being
+ // compacted to the bottom level (and thus having its seqnum zeroed) before
+ // the range tombstone. This used to happen when range tombstones were
+ // untruncated on reads such that they extended past their file boundaries.
+ //
+ // Test summary:
+ //
+ // - L1 is bottommost.
+ // - A couple snapshots are strategically taken to prevent seqnums from being
+ // zeroed, range tombstone from being dropped, merge operands from being
+ // dropped, and merge operands from being combined.
+ // - Left half of files in L1 all have same user key, ensuring their file
+ // boundaries overlap. In the past this would cause range tombstones to be
+ // untruncated.
+ // - Right half of L1 files all have different keys, ensuring no overlap.
+ // - A range tombstone spans all L1 keys, so it is stored in every L1 file.
+ // - Keys in the right side of the key-range are overwritten. These are
+ // compacted down to L1 after releasing snapshots such that their seqnums
+ // will be zeroed.
+ // - A full range scan is performed. If the tombstone in the left L1 files
+ // were untruncated, it would now cover keys newer than it (but with zeroed
+ // seqnums) in the right L1 files.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ const int kNumFiles = 4;
+ const int kMaxKey = kNumFiles * kFileBytes / kValueBytes;
+ const int kKeysOverwritten = 10;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.num_levels = 2;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ // - snapshots[0] prevents merge operands from being combined during
+ // compaction.
+ // - snapshots[1] prevents merge operands from being dropped due to the
+ // covering range tombstone.
+ const Snapshot* snapshots[] = {nullptr, nullptr};
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = RandomString(&rnd, kValueBytes);
+ std::string key;
+ if (i < kNumFiles / 2) {
+ key = Key(0);
+ } else {
+ key = Key(1 + i * kFileBytes / kValueBytes + j);
+ }
+ ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+ }
+ if (i == 0) {
+ snapshots[0] = db_->GetSnapshot();
+ }
+ if (i == kNumFiles - 1) {
+ snapshots[1] = db_->GetSnapshot();
+ // The DeleteRange is the last write so all merge operands are covered.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(kMaxKey + 1)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+ auto get_key_count = [this]() -> int {
+ auto* iter = db_->NewIterator(ReadOptions());
+ iter->SeekToFirst();
+ int keys_found = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ++keys_found;
+ }
+ delete iter;
+ return keys_found;
+ };
+
+ // All keys should be covered
+ ASSERT_EQ(0, get_key_count());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+ nullptr /* end_key */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ // Roughly the left half of L1 files should have overlapping boundary keys,
+ // while the right half should not.
+ ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+ // Now overwrite a few keys that are in L1 files that definitely don't have
+ // overlapping boundary keys.
+ for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) {
+ auto value = RandomString(&rnd, kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ // The overwritten keys are in L0 now, so clearly aren't covered by the range
+ // tombstone in L1.
+ ASSERT_EQ(kKeysOverwritten, get_key_count());
+
+ // Release snapshots so seqnums can be zeroed when L0->L1 happens.
+ db_->ReleaseSnapshot(snapshots[0]);
+ db_->ReleaseSnapshot(snapshots[1]);
+
+ auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1);
+ auto end_key_storage = Key(kMaxKey);
+ Slice begin_key(begin_key_storage);
+ Slice end_key(end_key_storage);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+ ASSERT_EQ(kKeysOverwritten, get_key_count());
+}
+
+TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) {
+ // Exposes a bug where we were using
+ // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands
+ // in the forward direction. Confusingly, this case happened during
+ // `DBIter::Prev`. It could cause assertion failure, or reappearing keys.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ // Need multiple keys so we can get results when calling `Prev()` after
+ // `SeekToLast()`.
+ const int kNumKeys = 3;
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = RandomString(&rnd, kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value));
+ if (i == 0 && j == kNumKeys) {
+ // Take snapshot to prevent covered merge operands from being dropped or
+ // merged by compaction.
+ snapshot = db_->GetSnapshot();
+ // Do a DeleteRange near the beginning so only the oldest merge operand
+ // for each key is covered. This ensures the sequence of events:
+ //
+ // - `DBIter::Prev()` is called
+ // - After several same versions of the same user key are encountered,
+ // it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`.
+ // - Binary searches to the newest version of the key, which is in the
+ // leftmost file containing the user key.
+ // - Scans forwards to collect all merge operands. Eventually reaches
+ // the rightmost file containing the oldest merge operand, which
+ // should be covered by the `DeleteRange`. If `RangeDelAggregator`
+ // were not properly using `kForwardTraversal` here, that operand
+ // would reappear.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(kNumKeys + 1)));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+ nullptr /* end_key */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ auto* iter = db_->NewIterator(ReadOptions());
+ iter->SeekToLast();
+ int keys_found = 0;
+ for (; iter->Valid(); iter->Prev()) {
+ ++keys_found;
+ }
+ delete iter;
+ ASSERT_EQ(kNumKeys, keys_found);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
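+// An iterator reading at a snapshot taken before the range deletion should
+// still see the covered key, even after the memtable is flushed.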
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
+ const int kFileBytes = 1 << 20;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(10)));
+
+ db_->Flush(FlushOptions());
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ auto* iter = db_->NewIterator(read_opts);
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(Key(0), iter->key());
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
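+// Same as above, except the flush is held back by a sync point so the covered
+// key and the range tombstone stay pinned in an immutable memtable.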
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) {
+ const int kFileBytes = 1 << 20;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ // block flush thread -> pin immutable memtables in memory
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator",
+ "DBImpl::BGWorkFlush"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(0), "a"));
+ std::unique_ptr<const Snapshot, std::function<void(const Snapshot*)>>
+ snapshot(db_->GetSnapshot(),
+ [this](const Snapshot* s) { db_->ReleaseSnapshot(s); });
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(10)));
+
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot.get();
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+
+ TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator");
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(Key(0), iter->key());
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+ // Adapted from
+ // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+ // Regression test for issue where range tombstone was written to more files
+ // than necessary when it began exactly at the begin key in the next
+ // compaction output file.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ // Have a bit of slack in the size limits but we enforce them more strictly
+ // when manually flushing/compacting.
+ options.max_compaction_bytes = 2 * kFileBytes;
+ options.target_file_size_base = 2 * kFileBytes;
+ options.write_buffer_size = 2 * kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ for (char first_char : {'a', 'b', 'c'}) {
+ for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+ std::string key(1, first_char);
+ key.append(Key(i));
+ std::string value = RandomString(&rnd, kValueBytes);
+ ASSERT_OK(Put(key, value));
+ }
+ db_->Flush(FlushOptions());
+ MoveFilesToLevel(2);
+ }
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+ // Populate the memtable lightly while spanning the whole key-space. The
+ // setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
+ // files to prevent a large L1->L2 compaction later.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "c" + Key(1), "d"));
+ // Our compaction output file cutting logic currently only considers point
+ // keys. So, in order for the range tombstone to have a chance at landing at
+ // the start of a new file, we need a point key at the range tombstone's
+ // start.
+ // TODO(ajkr): remove this `Put` after file cutting accounts for range
+ // tombstones (#3977).
+ ASSERT_OK(Put("c" + Key(1), "value"));
+ db_->Flush(FlushOptions());
+
+ // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
+ // and the range tombstone is only placed in the second SST.
+ std::string begin_key_storage("c" + Key(1));
+ Slice begin_key(begin_key_storage);
+ std::string end_key_storage("d");
+ Slice end_key(end_key_storage);
+ dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */,
+ &end_key /* end */, nullptr /* column_family */,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> all_metadata;
+ std::vector<LiveFileMetaData> l1_metadata;
+ db_->GetLiveFilesMetaData(&all_metadata);
+ for (const auto& metadata : all_metadata) {
+ if (metadata.level == 1) {
+ l1_metadata.push_back(metadata);
+ }
+ }
+ std::sort(l1_metadata.begin(), l1_metadata.end(),
+ [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+ return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+ 0;
+ });
+ ASSERT_EQ("a", l1_metadata[0].smallestkey);
+ ASSERT_EQ("a", l1_metadata[0].largestkey);
+ ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+ ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+ TablePropertiesCollection all_table_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+ int64_t num_range_deletions = 0;
+ for (const auto& name_and_table_props : all_table_props) {
+ const auto& name = name_and_table_props.first;
+ const auto& table_props = name_and_table_props.second;
+ // The range tombstone should only be output to the second L1 SST.
+ if (name.size() >= l1_metadata[1].name.size() &&
+ name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) {
+ ASSERT_EQ(1, table_props->num_range_deletions);
+ ++num_range_deletions;
+ } else {
+ ASSERT_EQ(0, table_props->num_range_deletions);
+ }
+ }
+ ASSERT_EQ(1, num_range_deletions);
+}
+
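+// A range tombstone overlapping both L2 files should end up in a single L1
+// output file rather than being split to respect max_compaction_bytes.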
+TEST_F(DBRangeDelTest, OverlappedTombstones) {
+ const int kNumPerFile = 4, kNumFiles = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.max_compaction_bytes = 9 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(RandomString(&rnd, 3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+ Key((kNumFiles)*kNumPerFile + 1)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+
+ // The range tombstone is kept in a single L1 SST rather than being broken
+ // up, even though that may incur a large compaction with L2.
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
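+// Companion to OverlappedTombstones: when the same key range is overwritten
+// with point keys instead of a range tombstone, the L0->L1 compaction output
+// is split into three SSTs to limit the future compaction with L2.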
+TEST_F(DBRangeDelTest, OverlappedKeys) {
+ const int kNumPerFile = 4, kNumFiles = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.max_compaction_bytes = 9 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(RandomString(&rnd, 3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+ for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) {
+ ASSERT_OK(Put(Key(i), "0x123"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // The key range is broken up into three SSTs to avoid a large future
+ // compaction with the grandparent level (L2).
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_sst_test.cc b/src/rocksdb/db/db_sst_test.cc
new file mode 100644
index 000000000..e0ecfb641
--- /dev/null
+++ b/src/rocksdb/db/db_sst_test.cc
@@ -0,0 +1,1227 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBSSTTest : public DBTestBase {
+ public:
+ DBSSTTest() : DBTestBase("/db_sst_test") {}
+};
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+#endif // ROCKSDB_LITE
+
+TEST_F(DBSSTTest, DontDeletePendingOutputs) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Every time we write to a table file, call FindObsoleteFiles() and
+ // PurgeObsoleteFiles() (FOF/POF) with a full DB scan. This makes sure our
+ // pending_outputs_ protection works correctly.
+ std::function<void()> purge_obsolete_files_function = [&]() {
+ JobContext job_context(0);
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->PurgeObsoleteFiles(job_context);
+ job_context.Clean();
+ };
+
+ env_->table_write_callback_ = &purge_obsolete_files_function;
+
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put("a", "begin"));
+ ASSERT_OK(Put("z", "end"));
+ ASSERT_OK(Flush());
+ }
+
+ // If pending output guard does not work correctly, PurgeObsoleteFiles() will
+ // delete the file that Compaction is trying to create, causing this: error
+ // db/db_test.cc:975: IO error:
+ // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
+ Compact("a", "b");
+}
+
+// 1. Create some SST files by inserting K-V pairs into the DB
+// 2. Close the DB and change the suffix from ".sst" to ".ldb" for every other
+//    SST file
+// 3. Reopen the DB and check that all keys can be read
+TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_id = 0;
+ for (int i = 0; i < 10; ++i) {
+ GenerateNewFile(&rnd, &key_id, false);
+ }
+ Flush();
+ Close();
+ int const num_files = GetSstFileCount(dbname_);
+ ASSERT_GT(num_files, 0);
+
+ Reopen(options);
+ std::vector<std::string> values;
+ values.reserve(key_id);
+ for (int k = 0; k < key_id; ++k) {
+ values.push_back(Get(Key(k)));
+ }
+ Close();
+
+ std::vector<std::string> filenames;
+ GetSstFiles(env_, dbname_, &filenames);
+ int num_ldb_files = 0;
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ if (i & 1) {
+ continue;
+ }
+ std::string const rdb_name = dbname_ + "/" + filenames[i];
+ std::string const ldb_name = Rocks2LevelTableFileName(rdb_name);
+ ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
+ ++num_ldb_files;
+ }
+ ASSERT_GT(num_ldb_files, 0);
+ ASSERT_EQ(num_files, GetSstFileCount(dbname_));
+
+ Reopen(options);
+ for (int k = 0; k < key_id; ++k) {
+ ASSERT_EQ(values[k], Get(Key(k)));
+ }
+ Destroy(options);
+}
+
+// Check that we don't crash when opening DB with
+// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
+TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
+ ASSERT_OK(Put("pika", "choo"));
+ ASSERT_OK(Flush());
+
+ // Just open the DB with the option set to true and check that we don't crash.
+ Options options;
+ options.skip_checking_sst_file_sizes_on_db_open = true;
+ Reopen(options);
+
+ ASSERT_EQ("choo", Get("pika"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBSSTTest, DontDeleteMovedFile) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->(move)->L2 compactions
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ // If the moved file is actually deleted (the move-safeguard in
+ // ~Version::Version() is not there), we get this failure:
+ // Corruption: Can't access /000009.sst
+ Reopen(options);
+}
+
+// This reproduces a bug where we don't delete a file because when it was
+// supposed to be deleted, it was blocked by pending_outputs
+// Consider:
+// 1. current file_number is 13
+// 2. compaction (1) starts, blocks deletion of all files starting with 13
+// (pending outputs)
+// 3. file 13 is created by compaction (2)
+// 4. file 13 is consumed by compaction (3) and file 15 is created. Since file
+// 13 has no references, it is put into VersionSet::obsolete_files_
+// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13
+// is deleted from obsolete_files_ set.
+// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
+// pending outputs since compaction (1) is still running. It is not deleted and
+// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
+TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 2 * 1024 * 1024; // 2 MB
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ options.max_background_flushes = 2;
+ options.max_background_compactions = 2;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ Reopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->(move)->L2 compactions
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ test::SleepingBackgroundTask blocking_thread;
+ port::Mutex mutex_;
+ bool already_blocked(false);
+
+ // block the flush
+ std::function<void()> block_first_time = [&]() {
+ bool blocking = false;
+ {
+ MutexLock l(&mutex_);
+ if (!already_blocked) {
+ blocking = true;
+ already_blocked = true;
+ }
+ }
+ if (blocking) {
+ blocking_thread.DoSleep();
+ }
+ };
+ env_->table_write_callback_ = &block_first_time;
+ // Insert 2.5MB of data, which should trigger a flush because we exceed
+ // write_buffer_size. The flush is blocked by block_first_time, and the
+ // pending-outputs mechanism protects all files created after this point.
+ for (int j = 0; j < 256; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024)));
+ }
+ blocking_thread.WaitUntilSleeping();
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto file_on_L2 = metadata[0].name;
+ listener->SetExpectedFileName(dbname_ + file_on_L2);
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */));
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+
+ // finish the flush!
+ blocking_thread.WakeUp();
+ blocking_thread.WaitUntilDone();
+ dbfull()->TEST_WaitForFlushMemTable();
+ // File just flushed is too big for L0 and L1 so gets moved to L2.
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0));
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 2U);
+
+ // This file should have been deleted during the last compaction
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2));
+ listener->VerifyMatchedCount(1);
+}
+
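+// SstFileManager should track every SST file the DB creates and deletes, and
+// its total-size accounting should match the files on disk, including across
+// reopens and after swapping in a fresh SstFileManager.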
+TEST_F(DBSSTTest, DBWithSstFileManager) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_moved = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* /*arg*/) { files_added++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile",
+ [&](void* /*arg*/) { files_deleted++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 25; i++) {
+ GenerateNewRandomFile(&rnd);
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ // Verify that we are tracking all sst files in dbname_
+ ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles());
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ auto files_in_db = GetAllSSTFiles();
+ // Verify that we are tracking all sst files in dbname_
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ // Verify the total files size
+ uint64_t total_files_size = 0;
+ for (auto& file_to_size : files_in_db) {
+ total_files_size += file_to_size.second;
+ }
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+ // We flushed at least 25 files
+ ASSERT_GE(files_added, 25);
+ // Compaction must have deleted some files
+ ASSERT_GT(files_deleted, 0);
+ // No files were moved
+ ASSERT_EQ(files_moved, 0);
+
+ Close();
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Verify that we track all the files again after the DB is closed and opened
+ Close();
+ sst_file_manager.reset(NewSstFileManager(env_));
+ options.sst_file_manager = sst_file_manager;
+ sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
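+// Deletions routed through SstFileManager should be rate limited: the time
+// spent emptying trash must roughly match the penalty computed from the
+// deleted file sizes and the configured delete rate.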
+TEST_F(DBSSTTest, RateLimitedDelete) {
+ Destroy(last_options_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBSSTTest::RateLimitedDelete:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+ // Turn timed wait into a simulated sleep
+ uint64_t* abs_time_us = static_cast<uint64_t*>(arg);
+ int64_t cur_time = 0;
+ env_->GetCurrentTime(&cur_time);
+ if (*abs_time_us > static_cast<uint64_t>(cur_time)) {
+ env_->addon_time_.fetch_add(*abs_time_us -
+ static_cast<uint64_t>(cur_time));
+ }
+
+ // Randomly sleep shortly
+ env_->addon_time_.fetch_add(
+ static_cast<uint64_t>(Random::GetTLSInstance()->Uniform(10)));
+
+ // Set wait until time to before current to force not to sleep.
+ int64_t real_cur_time = 0;
+ Env::Default()->GetCurrentTime(&real_cur_time);
+ *abs_time_us = static_cast<uint64_t>(real_cur_time);
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ env_->no_slowdown_ = true;
+ env_->time_elapse_only_sleep_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ // Disable stats dumping and persisting, which also use RepeatableThread,
+ // whose members include an InstrumentedCondVar. Otherwise the callback for
+ // InstrumentedCondVar::TimedWaitInternal could be triggered by the stats
+ // dumping and persisting threads and skew the time_spent_deleting
+ // measurement.
+ options.stats_dump_period_sec = 0;
+ options.stats_persist_period_sec = 0;
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+ ASSERT_OK(TryReopen(options));
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key3", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key1", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ uint64_t delete_start_time = env_->NowMicros();
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1");
+ sfm->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), metadata.size());
+ for (size_t i = 0; i < metadata.size(); i++) {
+ total_files_size += metadata[i].size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+ ASSERT_LT(time_spent_deleting, expected_penalty * 1.1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
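+// WAL files deleted after compaction should also go through the rate-limited
+// delete scheduler; the test expects 8 delete penalties in total.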
+TEST_F(DBSSTTest, RateLimitedWALDelete) {
+ Destroy(last_options_);
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ env_->no_slowdown_ = true;
+ env_->time_elapse_only_sleep_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+ ASSERT_OK(TryReopen(options));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(penalties.size(), 8);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class DBWALTestWithParam
+ : public DBSSTTest,
+ public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+ DBWALTestWithParam() {
+ wal_dir_ = std::get<0>(GetParam());
+ wal_dir_same_as_dbname_ = std::get<1>(GetParam());
+ }
+
+ std::string wal_dir_;
+ bool wal_dir_same_as_dbname_;
+};
+
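+// Leftover *.log.trash files should be cleaned up when the DB is reopened,
+// whether the WAL directory is the DB directory itself or a separate one.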
+TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
+ class MyEnv : public EnvWrapper {
+ public:
+ MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {}
+
+ Status DeleteFile(const std::string& fname) {
+ if (fname.find(".log.trash") != std::string::npos && fake_log_delete) {
+ return Status::OK();
+ }
+
+ return target()->DeleteFile(fname);
+ }
+
+ void set_fake_log_delete(bool fake) { fake_log_delete = fake; }
+
+ private:
+ bool fake_log_delete;
+ };
+
+ std::unique_ptr<MyEnv> env(new MyEnv(Env::Default()));
+ Destroy(last_options_);
+
+ env->set_fake_log_delete(true);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.wal_dir = dbname_ + wal_dir_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+ ASSERT_OK(TryReopen(options));
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ Close();
+
+ options.sst_file_manager.reset();
+ std::vector<std::string> filenames;
+ int trash_log_count = 0;
+ if (!wal_dir_same_as_dbname_) {
+ // Forcibly create some trash log files
+ std::unique_ptr<WritableFile> result;
+ env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result,
+ EnvOptions());
+ result.reset();
+ }
+ env->GetChildren(options.wal_dir, &filenames);
+ for (const std::string& fname : filenames) {
+ if (fname.find(".log.trash") != std::string::npos) {
+ trash_log_count++;
+ }
+ }
+ ASSERT_GE(trash_log_count, 1);
+
+ env->set_fake_log_delete(false);
+ ASSERT_OK(TryReopen(options));
+
+ filenames.clear();
+ trash_log_count = 0;
+ env->GetChildren(options.wal_dir, &filenames);
+ for (const std::string& fname : filenames) {
+ if (fname.find(".log.trash") != std::string::npos) {
+ trash_log_count++;
+ }
+ }
+ ASSERT_EQ(trash_log_count, 0);
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam,
+ ::testing::Values(std::make_tuple("", true),
+ std::make_tuple("_wal_dir", false)));
+
+TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
+ Options options = CurrentOptions();
+
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ Destroy(last_options_);
+
+ // Add some trash files to the db directory so the DB can clean them up
+ env_->CreateDirIfMissing(dbname_);
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
+
+ // Reopen the DB and verify that it deletes existing trash files
+ ASSERT_OK(TryReopen(options));
+ sfm->WaitForEmptyTrash();
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
+}
+
+
+// Create a DB with 2 db_paths, and generate multiple files in the 2
+// db_paths using CompactRangeOptions, make sure that files that were
+// deleted from first db_path were deleted using DeleteScheduler and
+// files in the second path were not.
+TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
+ std::atomic<int> bg_delete_file(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ // The deletion scheduler sometimes skips marking a file as trash, based on a
+ // heuristic. In that case the deletion goes through the SyncPoint below.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { bg_delete_file++; });
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.db_paths.emplace_back(dbname_, 1024 * 100);
+ options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100);
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 1024; // 1 MB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", rate_bytes_per_sec, false, &s,
+ /* max_trash_db_ratio= */ 1.1));
+
+ ASSERT_OK(s);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ // Create 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'), wo));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+ // Compaction will delete files from L0 in first db path and generate a new
+ // file in L1 in second db path
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ Slice begin("Key0");
+ Slice end("Key3");
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // Create 4 files in L0
+ for (int i = 4; i < 8; i++) {
+ ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B'), wo));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("4,1", FilesPerLevel(0));
+
+ // Compaction will delete files from L0 in first db path and generate a new
+ // file in L1 in second db path
+ begin = "Key4";
+ end = "Key7";
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ ASSERT_EQ("0,2", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
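+  // Each of the two compactions above deleted its 4 L0 inputs from the first
+  // db_path, so 8 files should have gone through the delete scheduler.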
+ ASSERT_EQ(bg_delete_file, 8);
+
+  // Compaction will delete both L1 files and regenerate a file in L1 in the
+  // second db path. The deleted files should still be cleaned up via the
+  // delete scheduler.
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
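+  // The bottommost compaction removed the two existing L1 files, raising the
+  // delete scheduler's count to 10.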
+ ASSERT_EQ(bg_delete_file, 10);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ DestroyAndReopen(options);
+
+ // Create 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A')));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Close DB and destroy it using DeleteScheduler
+ Close();
+
+ int num_sst_files = 0;
+ int num_wal_files = 0;
+ std::vector<std::string> db_files;
+ env_->GetChildren(dbname_, &db_files);
+  for (const std::string& f : db_files) {
+ if (f.substr(f.find_last_of(".") + 1) == "sst") {
+ num_sst_files++;
+ } else if (f.substr(f.find_last_of(".") + 1) == "log") {
+ num_wal_files++;
+ }
+ }
+ ASSERT_GT(num_sst_files, 0);
+ ASSERT_GT(num_wal_files, 0);
+
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ sfm->SetDeleteRateBytesPerSecond(1024 * 1024);
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
+ ASSERT_OK(DestroyDB(dbname_, options));
+ sfm->WaitForEmptyTrash();
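+  // DestroyDB() should route every SST and WAL file through the rate-limited
+  // delete scheduler.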
+ ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files);
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 100 keys.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+
+ uint64_t first_file_size = 0;
+ auto files_in_db = GetAllSSTFiles(&first_file_size);
+ ASSERT_EQ(sfm->GetTotalSize(), first_file_size);
+
+  // Set the maximum allowed space usage to just above the current total size
+  // so that the next flush exceeds it
+ sfm->SetMaxAllowedSpaceUsage(first_file_size + 1);
+
+ ASSERT_OK(Put("key1", "val1"));
+ // This flush will cause bg_error_ and will fail
+ ASSERT_NOK(Flush());
+}
+
+TEST_F(DBSSTTest, CancellingCompactionsWorks) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.level0_file_num_compaction_trigger = 2;
+ options.statistics = CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ int completed_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* /*arg*/) {
+ sfm->SetMaxAllowedSpaceUsage(0);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun",
+ [&](void* /*arg*/) { completed_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t total_file_size = 0;
+ auto files_in_db = GetAllSSTFiles(&total_file_size);
+ // Set the maximum allowed space usage to the current total size
+ sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+ // Generate another file to trigger compaction.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForCompact(true);
+
+ // Because we set a callback in CancelledCompaction, we actually
+ // let the compaction run
+ ASSERT_GT(completed_compactions, 0);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ // Make sure the stat is bumped
+  ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingManualCompactionsWorks) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.statistics = CreateDBStatistics();
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t total_file_size = 0;
+ auto files_in_db = GetAllSSTFiles(&total_file_size);
+ // Set the maximum allowed space usage to the current total size
+ sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+ // Generate another file to trigger compaction.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+
+ // OK, now trigger a manual compaction
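+  // It is expected to be cancelled by the SstFileManager because reserving
+  // space for its output would exceed the configured maximum allowed space.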
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ // Wait for manual compaction to get scheduled and finish
+ dbfull()->TEST_WaitForCompact(true);
+
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ // Make sure the stat is bumped
+ ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 1);
+
+ // Now make sure CompactFiles also gets cancelled
+ auto l0_files = collector->GetFlushedFiles();
+ dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0);
+
+ // Wait for manual compaction to get scheduled and finish
+ dbfull()->TEST_WaitForCompact(true);
+
+ ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 2);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+
+ // Now let the flush through and make sure GetCompactionsReservedSize
+ // returns to normal
+ sfm->SetMaxAllowedSpaceUsage(0);
+ int completed_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0);
+ dbfull()->TEST_WaitForCompact(true);
+
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ ASSERT_GT(completed_compactions, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) {
+ // This test will set a maximum allowed space for the DB, then it will
+ // keep filling the DB until the limit is reached and bg_error_ is set.
+ // When bg_error_ is set we will verify that the DB size is greater
+ // than the limit.
+
+ std::vector<int> max_space_limits_mbs = {1, 10};
+ std::atomic<bool> bg_error_set(false);
+
+ std::atomic<int> reached_max_space_on_flush(0);
+ std::atomic<int> reached_max_space_on_compaction(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ [&](void* arg) {
+ Status* bg_error = static_cast<Status*>(arg);
+ bg_error_set = true;
+ reached_max_space_on_flush++;
+ // clear error to ensure compaction callback is called
+ *bg_error = Status::OK();
+ });
+
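+  // When a compaction is about to be cancelled for lack of room, pretend there
+  // is enough room so it proceeds and later hits the MaxAllowedSpaceReached
+  // sync point while writing its output.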
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) {
+ bool* enough_room = static_cast<bool*>(arg);
+ *enough_room = true;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached",
+ [&](void* /*arg*/) {
+ bg_error_set = true;
+ reached_max_space_on_compaction++;
+ });
+
+ for (auto limit_mb : max_space_limits_mbs) {
+ bg_error_set = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.write_buffer_size = 1024 * 512; // 512 Kb
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024);
+
+ // It is easy to detect if the test is stuck in a loop. No need for
+ // complex termination logic.
+ while (true) {
+ auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ ASSERT_TRUE(bg_error_set);
+ uint64_t total_sst_files_size = 0;
+ GetAllSSTFiles(&total_sst_files_size);
+ ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+
+ ASSERT_GT(reached_max_space_on_flush, 0);
+ ASSERT_GT(reached_max_space_on_compaction, 0);
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
+ // Open DB with infinite max open files
+ // - First iteration use 1 thread to open files
+ // - Second iteration use 5 threads to open files
+ for (int iter = 0; iter < 2; iter++) {
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000;
+ options.disable_auto_compactions = true;
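+    // max_open_files == -1 keeps every table file open, so reopening the DB
+    // preloads a table reader for each file.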
+ options.max_open_files = -1;
+ if (iter == 0) {
+ options.max_file_opening_threads = 1;
+ } else {
+ options.max_file_opening_threads = 5;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+    // Create 12 Files in L0 (then move them to L2)
+ for (int i = 0; i < 12; i++) {
+ std::string k = "L2_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ db_->CompactRange(compact_options, nullptr, nullptr);
+
+ // Create 12 Files in L0
+ for (int i = 0; i < 12; i++) {
+ std::string k = "L0_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ Close();
+
+ // Reopening the DB will load all existing files
+ Reopen(options);
+ ASSERT_EQ("12,0,12", FilesPerLevel(0));
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+ for (const auto& level : files) {
+ for (const auto& file : level) {
+ ASSERT_TRUE(file.table_reader_handle != nullptr);
+ }
+ }
+
+ for (int i = 0; i < 12; i++) {
+ ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a'));
+ ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a'));
+ }
+ }
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSize) {
+  // We don't propagate the oldest-key-time table property on compaction and
+  // just write 0 as the default value. This affects the exact table size,
+  // since we encode table properties as varint64. Force the time to be 0 to
+  // work around it. The workaround should be removed once we propagate the
+  // property on compaction.
+ std::unique_ptr<MockTimeEnv> mock_env(new MockTimeEnv(Env::Default()));
+ mock_env->set_current_time(0);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = mock_env.get();
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 10; j++) {
+ std::string val = "val_file_" + ToString(i);
+ ASSERT_OK(Put(Key(j), val));
+ }
+ Flush();
+ }
+ ASSERT_EQ("5", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> live_files_meta;
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+ uint64_t single_file_size = live_files_meta[0].size;
+
+ uint64_t live_sst_files_size = 0;
+ uint64_t total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 5
+ // Total SST files = 5
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+ // Compact 5 files into 1 file in L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 1);
+
+ live_sst_files_size = 0;
+ total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 1 (compacted file)
+ // Total SST files = 6 (5 original files + compacted file)
+ ASSERT_EQ(live_sst_files_size, 1 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+ // Delete all keys and compact, this will delete all live files
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 0);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 6 (5 original files + compacted file)
+ ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+ iter1.reset();
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 1 (compacted file)
+ ASSERT_EQ(total_sst_files_size, 1 * single_file_size);
+
+ iter2.reset();
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 0
+ ASSERT_EQ(total_sst_files_size, 0);
+
+ // Close db before mock_env destruct.
+ Close();
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(Key(i), "val"));
+ Flush();
+ }
+ ASSERT_EQ("5", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> live_files_meta;
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+ uint64_t single_file_size = live_files_meta[0].size;
+
+ uint64_t live_sst_files_size = 0;
+ uint64_t total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+
+ // Live SST files = 5
+ // Total SST files = 5
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+ // Compaction will do trivial move from L0 to L1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,5", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+
+ live_sst_files_size = 0;
+ total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 5
+  // Total SST files = 5 (used in 2 versions)
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+ // Delete all keys and compact, this will delete all live files
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 0);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+  // Total SST files = 5 (used in 2 versions)
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ iter1.reset();
+ iter2.reset();
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 0
+ ASSERT_EQ(total_sst_files_size, 0);
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_statistics_test.cc b/src/rocksdb/db/db_statistics_test.cc
new file mode 100644
index 000000000..8fbbb96d5
--- /dev/null
+++ b/src/rocksdb/db/db_statistics_test.cc
@@ -0,0 +1,149 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+
+#include "db/db_test_util.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBStatisticsTest : public DBTestBase {
+ public:
+ DBStatisticsTest() : DBTestBase("/db_statistics_test") {}
+};
+
+TEST_F(DBStatisticsTest, CompressionStatsTest) {
+ CompressionType type;
+
+ if (Snappy_Supported()) {
+ type = kSnappyCompression;
+ fprintf(stderr, "using snappy\n");
+ } else if (Zlib_Supported()) {
+ type = kZlibCompression;
+ fprintf(stderr, "using zlib\n");
+ } else if (BZip2_Supported()) {
+ type = kBZip2Compression;
+ fprintf(stderr, "using bzip2\n");
+ } else if (LZ4_Supported()) {
+ type = kLZ4Compression;
+ fprintf(stderr, "using lz4\n");
+ } else if (XPRESS_Supported()) {
+ type = kXpressCompression;
+ fprintf(stderr, "using xpress\n");
+ } else if (ZSTD_Supported()) {
+ type = kZSTD;
+ fprintf(stderr, "using ZSTD\n");
+ } else {
+ fprintf(stderr, "skipping test, compression disabled\n");
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.compression = type;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+ DestroyAndReopen(options);
+
+ int kNumKeysWritten = 100000;
+
+ // Check that compressions occur and are counted when compression is turned on
+ Random rnd(301);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0);
+
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
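+  // The Statistics object in `options` survives the reopen, so record the
+  // current counters as a baseline and verify they do not change below.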
+ uint64_t currentCompressions =
+ options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+ uint64_t currentDecompressions =
+ options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED);
+
+ // Check that compressions do not occur when turned off
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED)
+ - currentCompressions, 0);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED)
+ - currentDecompressions, 0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
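+  // DB_MUTEX_WAIT_MICROS is only recorded at StatsLevel::kAll, which is not
+  // the default, so the ticker is expected to stay at zero.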
+ const uint64_t kMutexWaitDelay = 100;
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+ kMutexWaitDelay);
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0);
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ const uint64_t kMutexWaitDelay = 100;
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+ kMutexWaitDelay);
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, ResetStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ for (int i = 0; i < 2; ++i) {
+ // pick arbitrary ticker and histogram. On first iteration they're zero
+ // because db is unused. On second iteration they're zero due to Reset().
+ ASSERT_EQ(0, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+ HistogramData histogram_data;
+ options.statistics->histogramData(DB_WRITE, &histogram_data);
+ ASSERT_EQ(0.0, histogram_data.max);
+
+ if (i == 0) {
+ // The Put() makes some of the ticker/histogram stats nonzero until we
+ // Reset().
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+ options.statistics->histogramData(DB_WRITE, &histogram_data);
+ ASSERT_GT(histogram_data.max, 0.0);
+ options.statistics->Reset();
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_table_properties_test.cc b/src/rocksdb/db/db_table_properties_test.cc
new file mode 100644
index 000000000..e3499df70
--- /dev/null
+++ b/src/rocksdb/db/db_table_properties_test.cc
@@ -0,0 +1,336 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper function that ensures the table properties returned in
+// `GetPropertiesOfAllTablesTest` are correct.
+// It assumes that the number of entries differs for each of the tables.
+namespace {
+
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+ TablePropertiesCollection props;
+ ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+ ASSERT_EQ(4U, props.size());
+ std::unordered_set<uint64_t> unique_entries;
+
+  // Indirectly verify the per-table entry counts: they must all be distinct
+  // and sum to the expected total.
+ uint64_t sum = 0;
+ for (const auto& item : props) {
+ unique_entries.insert(item.second->num_entries);
+ sum += item.second->num_entries;
+ }
+
+ ASSERT_EQ(props.size(), unique_entries.size());
+ ASSERT_EQ(expected_entries_size, sum);
+}
+} // namespace
+
+class DBTablePropertiesTest : public DBTestBase {
+ public:
+ DBTablePropertiesTest() : DBTestBase("/db_table_properties_test") {}
+ TablePropertiesCollection TestGetPropertiesOfTablesInRange(
+ std::vector<Range> ranges, std::size_t* num_properties = nullptr,
+ std::size_t* num_files = nullptr);
+};
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ Reopen(options);
+ // Create 4 tables
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
+ }
+ db_->Flush(FlushOptions());
+ }
+
+ // 1. Read table properties directly from file
+ Reopen(options);
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 2. Put two tables into the table cache.
+  Reopen(options);
+  // Fetch a key from the 1st and 2nd table, which internally places those
+  // tables into the table cache.
+ for (int i = 0; i < 2; ++i) {
+ Get(ToString(i * 100 + 0));
+ }
+
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 3. Put all tables into the table cache.
+  Reopen(options);
+  // Fetch a key from every table, which internally places all four tables
+  // into the table cache.
+ for (int i = 0; i < 4; ++i) {
+ Get(ToString(i * 100 + 0));
+ }
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+}
+
+TablePropertiesCollection
+DBTablePropertiesTest::TestGetPropertiesOfTablesInRange(
+ std::vector<Range> ranges, std::size_t* num_properties,
+ std::size_t* num_files) {
+
+  // Since we dereference the vector's first element, it cannot be empty;
+  // otherwise we would pass an address to some random memory
+ EXPECT_GT(ranges.size(), 0U);
+ // run the query
+ TablePropertiesCollection props;
+ EXPECT_OK(db_->GetPropertiesOfTablesInRange(
+ db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props));
+
+  // Make sure that we've received properties only for those files which fall
+  // within the requested ranges
+ std::vector<LiveFileMetaData> vmd;
+ db_->GetLiveFilesMetaData(&vmd);
+ for (auto& md : vmd) {
+ std::string fn = md.db_path + md.name;
+ bool in_range = false;
+ for (auto& r : ranges) {
+      // smallestkey <= limit && start <= largestkey
+ if (r.limit.compare(md.smallestkey) >= 0 &&
+ r.start.compare(md.largestkey) <= 0) {
+ in_range = true;
+ EXPECT_GT(props.count(fn), 0);
+ }
+ }
+ if (!in_range) {
+ EXPECT_EQ(props.count(fn), 0);
+ }
+ }
+
+ if (num_properties) {
+ *num_properties = props.size();
+ }
+
+ if (num_files) {
+ *num_files = vmd.size();
+ }
+ return props;
+}
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) {
+  // Fixed random seed
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.max_bytes_for_level_base = 40960;
+ options.max_bytes_for_level_multiplier = 4;
+ options.hard_pending_compaction_bytes_limit = 16 * 1024;
+ options.num_levels = 8;
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ // build a decent LSM
+ for (int i = 0; i < 10000; i++) {
+ ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ if (NumTableFilesAtLevel(0) == 0) {
+ ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102)));
+ Flush();
+ }
+
+ db_->PauseBackgroundWork();
+
+ // Ensure that we have at least L0, L1 and L2
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+ // Query the largest range
+ std::size_t num_properties, num_files;
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+ &num_properties, &num_files);
+ ASSERT_EQ(num_properties, num_files);
+
+ // Query the empty range
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))},
+ &num_properties, &num_files);
+ ASSERT_GT(num_files, 0);
+ ASSERT_EQ(num_properties, 0);
+
+  // Query the middle range
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+ &num_properties, &num_files);
+ ASSERT_GT(num_files, 0);
+ ASSERT_GT(num_files, num_properties);
+ ASSERT_GT(num_properties, 0);
+
+ // Query a bunch of random ranges
+ for (int j = 0; j < 100; j++) {
+ // create a bunch of ranges
+ std::vector<std::string> random_keys;
+    // Random::Uniform() can return zero, and passing empty ranges would make
+    // TestGetPropertiesOfTablesInRange() dereference random memory through the
+    // empty ranges[0]. Keep the count greater than zero, and even, because the
+    // loop below pairs the keys up into ranges.
+ auto n = 2 * (rnd.Uniform(50) + 1);
+
+ for (uint32_t i = 0; i < n; ++i) {
+ random_keys.push_back(test::RandomKey(&rnd, 5));
+ }
+
+ ASSERT_GT(random_keys.size(), 0U);
+ ASSERT_EQ((random_keys.size() % 2), 0U);
+
+ std::vector<Range> ranges;
+ auto it = random_keys.begin();
+ while (it != random_keys.end()) {
+ ranges.push_back(Range(*it, *(it + 1)));
+ it += 2;
+ }
+
+ TestGetPropertiesOfTablesInRange(std::move(ranges));
+ }
+}
+
+TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) {
+ std::string kExtraCfName = "pikachu";
+ CreateAndReopenWithCF({kExtraCfName}, CurrentOptions());
+
+ // Create one table per CF, then verify it was created with the column family
+ // name property.
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ Put(cf, "key", "val");
+ Flush(cf);
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+ ASSERT_EQ(1U, fname_to_props.size());
+
+ std::string expected_cf_name;
+ if (cf > 0) {
+ expected_cf_name = kExtraCfName;
+ } else {
+ expected_cf_name = kDefaultColumnFamilyName;
+ }
+ ASSERT_EQ(expected_cf_name,
+ fname_to_props.begin()->second->column_family_name);
+ ASSERT_EQ(cf, static_cast<uint32_t>(
+ fname_to_props.begin()->second->column_family_id));
+ }
+}
+
+TEST_F(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) {
+ int kNumKeys = 1000;
+ int kWindowSize = 100;
+ int kNumDelsTrigger = 90;
+ std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger);
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(compact_on_del);
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ Put(Key(0), "val");
+ Flush();
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+
+ // Change the window size and deletion trigger and ensure new values take
+ // effect
+ kWindowSize = 50;
+ kNumDelsTrigger = 40;
+ static_cast<CompactOnDeletionCollectorFactory*>
+ (compact_on_del.get())->SetWindowSize(kWindowSize);
+ static_cast<CompactOnDeletionCollectorFactory*>
+ (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger);
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+
+ // Change the window size to disable delete triggered compaction
+ kWindowSize = 0;
+ static_cast<CompactOnDeletionCollectorFactory*>
+ (compact_on_del.get())->SetWindowSize(kWindowSize);
+ static_cast<CompactOnDeletionCollectorFactory*>
+ (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger);
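+  // With a window size of zero the loop below never deletes any keys, so no
+  // deletion-triggered compaction is marked and the flushed file stays in L0.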
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_tailing_iter_test.cc b/src/rocksdb/db/db_tailing_iter_test.cc
new file mode 100644
index 000000000..39988638b
--- /dev/null
+++ b/src/rocksdb/db/db_tailing_iter_test.cc
@@ -0,0 +1,547 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestTailingIterator : public DBTestBase {
+ public:
+ DBTestTailingIterator() : DBTestBase("/db_tailing_iterator_test") {}
+};
+
+TEST_F(DBTestTailingIterator, TailingIteratorSingle) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ ASSERT_TRUE(!iter->Valid());
+
+ // add a record and check that iter can see it
+ ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "mirko");
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorKeepAdding) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ std::string value(1024, 'a');
+
+ const int num_records = 10000;
+ for (int i = 0; i < num_records; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%016d", i);
+
+ Slice key(buf, 16);
+ ASSERT_OK(Put(1, key, value));
+
+ iter->Seek(key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorSeekToNext) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+ std::string value(1024, 'a');
+
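+  // Each iteration writes key i*5, seeks `iter` to a slightly smaller key and
+  // advances `itern`; both tailing iterators are expected to land on the key
+  // that was just written.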
+ const int num_records = 1000;
+ for (int i = 1; i < num_records; ++i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ if (i == 1) {
+ itern->SeekToFirst();
+ } else {
+ itern->Next();
+ }
+ ASSERT_TRUE(itern->Valid());
+ ASSERT_EQ(itern->key().compare(key), 0);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ for (int i = 2 * num_records; i > 0; --i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
+ const uint64_t k150KB = 150 * 1024;
+ Options options;
+ options.write_buffer_size = k150KB;
+ options.max_write_buffer_number = 3;
+ options.min_write_buffer_number_to_merge = 2;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ReadOptions read_options;
+ read_options.tailing = true;
+ int num_iters, deleted_iters;
+
+ char bufe[32];
+ snprintf(bufe, sizeof(bufe), "00b0%016d", 0);
+ Slice keyu(bufe, 20);
+ read_options.iterate_upper_bound = &keyu;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+ std::unique_ptr<Iterator> iterh(db_->NewIterator(read_options, handles_[1]));
+ std::string value(1024, 'a');
+ bool file_iters_deleted = false;
+ bool file_iters_renewed_null = false;
+ bool file_iters_renewed_copy = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::SeekInternal:Return", [&](void* arg) {
+ ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+ ASSERT_TRUE(!file_iters_deleted ||
+ fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::Next:Return", [&](void* arg) {
+ ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+ ASSERT_TRUE(!file_iters_deleted ||
+ fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::RenewIterators:Null",
+ [&](void* /*arg*/) { file_iters_renewed_null = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::RenewIterators:Copy",
+ [&](void* /*arg*/) { file_iters_renewed_copy = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const int num_records = 1000;
+ for (int i = 1; i < num_records; ++i) {
+ char buf1[32];
+ char buf2[32];
+ char buf3[32];
+ char buf4[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+ snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+ Slice keyn(buf3, 20);
+ ASSERT_OK(Put(1, keyn, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ if (i == 299) {
+ file_iters_deleted = true;
+ }
+ snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2);
+ Slice target(buf4, 20);
+ iterh->Seek(target);
+      ASSERT_TRUE(iterh->Valid());
+ for (int j = (i + 1) * 5 / 2; j < i * 5; j += 5) {
+ iterh->Next();
+ ASSERT_TRUE(iterh->Valid());
+ }
+ if (i == 299) {
+ file_iters_deleted = false;
+ }
+ }
+
+ file_iters_deleted = true;
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ ASSERT_LE(num_iters, 1);
+ if (i == 1) {
+ itern->SeekToFirst();
+ } else {
+ itern->Next();
+ }
+ ASSERT_TRUE(itern->Valid());
+ ASSERT_EQ(itern->key().compare(key), 0);
+ ASSERT_LE(num_iters, 1);
+ file_iters_deleted = false;
+ }
+ ASSERT_TRUE(file_iters_renewed_null);
+ ASSERT_TRUE(file_iters_renewed_copy);
+ iter = nullptr;
+ itern = nullptr;
+ iterh = nullptr;
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ read_options.read_tier = kBlockCacheTier;
+ std::unique_ptr<Iterator> iteri(db_->NewIterator(read_options, handles_[1]));
+ char buf5[32];
+ snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2);
+ Slice target1(buf5, 20);
+ iteri->Seek(target1);
+ ASSERT_TRUE(iteri->status().IsIncomplete());
+ iteri = nullptr;
+
+ read_options.read_tier = kReadAllTier;
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ iter.reset(db_->NewIterator(read_options, handles_[1]));
+ for (int i = 2 * num_records; i > 0; --i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorDeletes) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+
+ // write a single record, read it using the iterator, then delete it
+ ASSERT_OK(Put(1, "0test", "test"));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "0test");
+ ASSERT_OK(Delete(1, "0test"));
+
+ // write many more records
+ const int num_records = 10000;
+ std::string value(1024, 'A');
+
+ for (int i = 0; i < num_records; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "1%015d", i);
+
+ Slice key(buf, 16);
+ ASSERT_OK(Put(1, key, value));
+ }
+
+ // force a flush to make sure that no records are read from memtable
+ ASSERT_OK(Flush(1));
+
+ // skip "0test"
+ iter->Next();
+
+ // make sure we can read all new records using the existing iterator
+ int count = 0;
+ for (; iter->Valid(); iter->Next(), ++count) ;
+
+ ASSERT_EQ(count, num_records);
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorPrefixSeek) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ options.allow_concurrent_memtable_write = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(Put(1, "0101", "test"));
+
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "0202", "test"));
+
+ // Seek(0102) shouldn't find any records since 0202 has a different prefix
+ iter->Seek("0102");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("0202");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "0202");
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorIncomplete) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ read_options.read_tier = kBlockCacheTier;
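+  // With kBlockCacheTier reads must be served from the memtable or the block
+  // cache; anything that would require I/O yields Status::Incomplete.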
+
+ std::string key("key");
+ std::string value("value");
+
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ // we either see the entry or it's not in cache
+ ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ iter->SeekToFirst();
+ // should still be true after compaction
+ ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorSeekToSame) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 1000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ const int NROWS = 10000;
+ // Write rows with keys 00000, 00002, 00004 etc.
+ for (int i = 0; i < NROWS; ++i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%05d", 2*i);
+ std::string key(buf);
+ std::string value("value");
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ // Seek to 00001. We expect to find 00002.
+ std::string start_key = "00001";
+ iter->Seek(start_key);
+ ASSERT_TRUE(iter->Valid());
+
+ std::string found = iter->key().ToString();
+ ASSERT_EQ("00002", found);
+
+ // Now seek to the same key. The iterator should remain in the same
+ // position.
+ iter->Seek(found);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(found, iter->key().ToString());
+}
+
+// Sets iterate_upper_bound and verifies that ForwardIterator doesn't call
+// Seek() on immutable iterators when target key is >= prev_key and all
+// iterators, including the memtable iterator, are over the upper bound.
+TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ const Slice upper_bound("20", 3);
+ ReadOptions read_options;
+ read_options.tailing = true;
+ read_options.iterate_upper_bound = &upper_bound;
+
+ ASSERT_OK(Put(1, "11", "11"));
+ ASSERT_OK(Put(1, "12", "12"));
+ ASSERT_OK(Put(1, "22", "22"));
+ ASSERT_OK(Flush(1)); // flush all those keys to an immutable SST file
+
+ // Add another key to the memtable.
+ ASSERT_OK(Put(1, "21", "21"));
+
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+ it->Seek("12");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("12", it->key().ToString());
+
+ it->Next();
+ // Not valid since "21" is over the upper bound.
+ ASSERT_FALSE(it->Valid());
+
+ // This keeps track of the number of times NeedToSeekImmutable() was true.
+ int immutable_seeks = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::SeekInternal:Immutable",
+ [&](void* /*arg*/) { ++immutable_seeks; });
+
+ // Seek to 13. This should not require any immutable seeks.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ it->Seek("13");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_FALSE(it->Valid());
+ ASSERT_EQ(0, immutable_seeks);
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorGap) {
+ // level 1: [20, 25] [35, 40]
+ // level 2: [10 - 15] [45 - 50]
+ // level 3: [20, 30, 40]
+  // Previously there was a bug in the tailing iterator: if there was a gap in
+  // a lower level, a key within the range between the largest key of file n
+  // and the smallest key of file n+1 would be skipped if both files fit in
+  // that gap. In this example, 25 < key < 35.
+ // https://github.com/facebook/rocksdb/issues/1372
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ ReadOptions read_options;
+ read_options.tailing = true;
+
+ ASSERT_OK(Put(1, "20", "20"));
+ ASSERT_OK(Put(1, "30", "30"));
+ ASSERT_OK(Put(1, "40", "40"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(3, 1);
+
+ ASSERT_OK(Put(1, "10", "10"));
+ ASSERT_OK(Put(1, "15", "15"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "45", "45"));
+ ASSERT_OK(Put(1, "50", "50"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ ASSERT_OK(Put(1, "20", "20"));
+ ASSERT_OK(Put(1, "25", "25"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "35", "35"));
+ ASSERT_OK(Put(1, "40", "40"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(1, 1);
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(handles_[1], &meta);
+
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+ it->Seek("30");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("30", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("35", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("40", it->key().ToString());
+}
+
+TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ const Slice upper_bound("cc", 3);
+ read_options.iterate_upper_bound = &upper_bound;
+
+
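+  // The upper bound "cc" excludes the second L0 file (key "zz"); the seek to
+  // "aa" must still find it despite the out-of-bound file.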
+ // 1st L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+ ASSERT_OK(Flush());
+
+ // 2nd L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->Seek("aa");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+TEST_F(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ const Slice upper_bound("cc", 3);
+ read_options.iterate_upper_bound = &upper_bound;
+
+
+ // 1st L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+ ASSERT_OK(Flush());
+
+ // 2nd L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc
new file mode 100644
index 000000000..60b4d60f4
--- /dev/null
+++ b/src/rocksdb/db/db_test.cc
@@ -0,0 +1,6605 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity because it is a good test.
+#include <fcntl.h>
+#include <algorithm>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include "cache/lru_cache.h"
+#include "db/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "memtable/hash_linklist_rep.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest : public DBTestBase {
+ public:
+ DBTest() : DBTestBase("/db_test") {}
+};
+
+class DBTestWithParam
+ : public DBTest,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBTestWithParam() {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+TEST_F(DBTest, MockEnvTest) {
+ std::unique_ptr<MockEnv> env{new MockEnv(Env::Default())};
+ Options options;
+ options.create_if_missing = true;
+ options.env = env.get();
+ DB* db;
+
+ const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+ const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
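+  // "/dir/db" exists only inside the in-memory MockEnv; nothing is written to
+  // the real filesystem.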
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ Iterator* iterator = db->NewIterator(ReadOptions());
+ iterator->SeekToFirst();
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_TRUE(keys[i] == iterator->key());
+ ASSERT_TRUE(vals[i] == iterator->value());
+ iterator->Next();
+ }
+ ASSERT_TRUE(!iterator->Valid());
+ delete iterator;
+
+// TEST_FlushMemTable() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+#endif // ROCKSDB_LITE
+
+ delete db;
+}
+
+// NewMemEnv returns nullptr in ROCKSDB_LITE since class InMemoryEnv isn't
+// defined.
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, MemEnvTest) {
+ std::unique_ptr<Env> env{NewMemEnv(Env::Default())};
+ Options options;
+ options.create_if_missing = true;
+ options.env = env.get();
+ DB* db;
+
+ const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+ const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ Iterator* iterator = db->NewIterator(ReadOptions());
+ iterator->SeekToFirst();
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_TRUE(keys[i] == iterator->key());
+ ASSERT_TRUE(vals[i] == iterator->value());
+ iterator->Next();
+ }
+ ASSERT_TRUE(!iterator->Valid());
+ delete iterator;
+
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ delete db;
+
+ options.create_if_missing = false;
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+ delete db;
+}
+#endif // ROCKSDB_LITE
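+
+// Illustrative sketch (not part of the upstream tests): the typical way an
+// application wires the in-memory Env exercised above into its own database.
+// The DB must be deleted before the Env it uses goes out of scope. Guarded by
+// #if 0 so it is never compiled; the path used here is made up.
+#if 0
+static void MemEnvUsageSketch() {
+  std::unique_ptr<Env> mem_env{NewMemEnv(Env::Default())};
+  Options opts;
+  opts.create_if_missing = true;
+  opts.env = mem_env.get();
+
+  DB* db = nullptr;
+  Status s = DB::Open(opts, "/in-memory/db", &db);
+  if (s.ok()) {
+    s = db->Put(WriteOptions(), "key", "value");
+    std::string value;
+    s = db->Get(ReadOptions(), "key", &value);
+  }
+  delete db;  // close the DB while mem_env is still alive
+}
+#endif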
+
+TEST_F(DBTest, WriteEmptyBatch) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+ WriteBatch empty_batch;
+ ASSERT_OK(dbfull()->Write(wo, &empty_batch));
+
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBTest, SkipDelay) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (bool sync : {true, false}) {
+ for (bool disableWAL : {true, false}) {
+ if (sync && disableWAL) {
+ // sync and disableWAL are incompatible.
+ continue;
+ }
+ // Use a very small delayed-write rate so the resulting delay is large
+ // enough to still be in effect when we do the Put.
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Sleep",
+ [&](void* /*arg*/) { sleep_count.fetch_add(1); });
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { wait_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = sync;
+ wo.disableWAL = disableWAL;
+ wo.no_slowdown = true;
+ dbfull()->Put(wo, "foo", "bar");
+ // We need the 2nd write to trigger the delay, because the delay is
+ // estimated from the previous write's size, which is 0 for the first write.
+ ASSERT_NOK(dbfull()->Put(wo, "foo2", "bar2"));
+ ASSERT_GE(sleep_count.load(), 0);
+ ASSERT_GE(wait_count.load(), 0);
+ token.reset();
+
+ token = dbfull()->TEST_write_controler().GetDelayToken(1000000000);
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar3"));
+ ASSERT_GE(sleep_count.load(), 1);
+ token.reset();
+ }
+ }
+}
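+
+// Illustrative sketch (not part of the upstream tests): how a caller can use
+// WriteOptions::no_slowdown, which the tests above exercise. When a write
+// would have to wait out a stall, RocksDB is expected to return a non-OK
+// status instead of blocking (IsIncomplete() in the versions this test
+// targets; treat that as an assumption). Guarded by #if 0, never compiled.
+#if 0
+static void NoSlowdownSketch(DB* db) {
+  WriteOptions wo;
+  wo.no_slowdown = true;
+  Status s = db->Put(wo, "key", "value");
+  if (!s.ok()) {
+    // The write was rejected rather than stalled. Either retry later or
+    // fall back to a blocking write.
+    wo.no_slowdown = false;
+    s = db->Put(wo, "key", "value");
+  }
+}
+#endif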
+
+TEST_F(DBTest, MixedSlowdownOptions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ // Use a very small delayed-write rate so the resulting delay is large
+ // enough to still be in effect when we do the Put.
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) {
+ sleep_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_slowdown_func);
+ }
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ dbfull()->Put(wo, "foo", "bar");
+ // We need the 2nd write to trigger the delay, because the delay is
+ // estimated from the previous write's size, which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_GE(sleep_count.load(), 1);
+
+ wo.no_slowdown = true;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ // Use a very small delayed-write rate so the resulting delay is large
+ // enough to still be in effect when we do the Put.
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) {
+ sleep_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+ // Sleep for 3s to allow the threads to insert themselves into the
+ // write queue
+ env_->SleepForMicroseconds(3000000ULL);
+ }
+ });
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { wait_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ dbfull()->Put(wo, "foo", "bar");
+ // We need the 2nd write to trigger the delay, because the delay is
+ // estimated from the previous write's size, which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_EQ(sleep_count.load(), 1);
+ ASSERT_GE(wait_count.load(), 0);
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsStop) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> wakeup_writer = [&]() {
+ dbfull()->mutex_.Lock();
+ dbfull()->bg_cv_.SignalAll();
+ dbfull()->mutex_.Unlock();
+ };
+ // Use a stop token so that writers block until the token is released.
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetStopToken();
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+ wait_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_slowdown_func);
+ }
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+ // Sleep for 3s to allow the threads to insert themselves into the
+ // write queue
+ env_->SleepForMicroseconds(3000000ULL);
+ }
+ token.reset();
+ threads.emplace_back(wakeup_writer);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ dbfull()->Put(wo, "foo", "bar");
+ // We need the 2nd write to trigger the delay, because the delay is
+ // estimated from the previous write's size, which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_GE(wait_count.load(), 1);
+
+ wo.no_slowdown = true;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, LevelLimitReopen) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const std::string value(1024 * 1024, ' ');
+ int i = 0;
+ while (NumTableFilesAtLevel(2, 1) == 0) {
+ ASSERT_OK(Put(1, Key(i++), value));
+ }
+
+ options.num_levels = 1;
+ options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(s.IsInvalidArgument(), true);
+ ASSERT_EQ(s.ToString(),
+ "Invalid argument: db has more levels than options.num_levels");
+
+ options.num_levels = 10;
+ options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif // ROCKSDB_LITE
+
+
+TEST_F(DBTest, PutSingleDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo2", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo2"));
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
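+
+// Illustrative sketch (not part of the upstream tests): the SingleDelete
+// contract the tests above rely on. SingleDelete should only be used when the
+// key was Put exactly once since the previous Delete/SingleDelete; mixing it
+// with Merge or repeated Puts gives undefined results. Guarded by #if 0.
+#if 0
+static void SingleDeleteSketch(DB* db) {
+  // One Put followed by one SingleDelete: well defined, the key disappears.
+  Status s = db->Put(WriteOptions(), "foo", "v1");
+  s = db->SingleDelete(WriteOptions(), "foo");
+  std::string value;
+  s = db->Get(ReadOptions(), "foo", &value);  // expected: s.IsNotFound()
+}
+#endif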
+
+TEST_F(DBTest, ReadFromPersistedTier) {
+ do {
+ Random rnd(301);
+ Options options = CurrentOptions();
+ for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteOptions wopt;
+ wopt.disableWAL = (disableWAL == 1);
+ // 1st round: put but not flush
+ ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
+ ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
+ ASSERT_EQ("first", Get(1, "foo"));
+ ASSERT_EQ("one", Get(1, "bar"));
+
+ // Read directly from persisted data.
+ ReadOptions ropt;
+ ropt.read_tier = kPersistedTier;
+ std::string value;
+ if (wopt.disableWAL) {
+ // As the data has not yet been flushed, we expect NotFound.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ }
+
+ // Multiget
+ std::vector<ColumnFamilyHandle*> multiget_cfs;
+ multiget_cfs.push_back(handles_[1]);
+ multiget_cfs.push_back(handles_[1]);
+ std::vector<Slice> multiget_keys;
+ multiget_keys.push_back("foo");
+ multiget_keys.push_back("bar");
+ std::vector<std::string> multiget_values;
+ auto statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ } else {
+ ASSERT_OK(statuses[0]);
+ ASSERT_OK(statuses[1]);
+ }
+
+ // 2nd round: flush and put a new value in memtable.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));
+
+ // once the data has been flushed, we are able to get the
+ // data when kPersistedTier is used.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
+ ASSERT_EQ(value, "first");
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+ ASSERT_EQ(value, "one");
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(
+ db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
+ ASSERT_EQ(value, "hello");
+ }
+
+ // Expect same result in multiget
+ multiget_cfs.push_back(handles_[1]);
+ multiget_keys.push_back("rocksdb");
+ statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ ASSERT_TRUE(statuses[0].ok());
+ ASSERT_EQ("first", multiget_values[0]);
+ ASSERT_TRUE(statuses[1].ok());
+ ASSERT_EQ("one", multiget_values[1]);
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[2].IsNotFound());
+ } else {
+ ASSERT_OK(statuses[2]);
+ }
+
+ // 3rd round: delete and flush
+ ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
+ Flush(1);
+ ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
+
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+ if (wopt.disableWAL) {
+ // Still expect to find the value, as its delete has not yet been
+ // flushed.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+ ASSERT_EQ(value, "one");
+ } else {
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+ }
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
+ ASSERT_EQ(value, "hello");
+
+ statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[1].ok());
+ ASSERT_EQ("one", multiget_values[1]);
+ } else {
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ }
+ ASSERT_TRUE(statuses[2].ok());
+ ASSERT_EQ("hello", multiget_values[2]);
+ if (wopt.disableWAL == 0) {
+ DestroyAndReopen(options);
+ }
+ }
+ } while (ChangeOptions());
+}
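+
+// Illustrative sketch (not part of the upstream tests): reading only
+// persisted data with ReadOptions::read_tier = kPersistedTier, as exercised
+// above. Entries that live only in the memtable and were written with
+// disableWAL (so nothing is persisted yet) come back as NotFound; data
+// covered by the WAL or by SST files is visible. Guarded by #if 0.
+#if 0
+static void PersistedTierSketch(DB* db) {
+  ReadOptions ropt;
+  ropt.read_tier = kPersistedTier;
+  std::string value;
+  Status s = db->Get(ropt, "key", &value);
+  if (s.IsNotFound()) {
+    // The key is either absent or not yet persisted (memtable-only data
+    // written with disableWAL).
+  }
+}
+#endif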
+
+TEST_F(DBTest, SingleDeleteFlush) {
+ // Test to check whether flushing preserves a single delete hidden
+ // behind a put.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Put values on the second level (so that they will not be in the same
+ // compaction as the other operations).
+ Put(1, "foo", "first");
+ Put(1, "bar", "one");
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ // (Single) delete hidden by a put
+ SingleDelete(1, "foo");
+ Put(1, "foo", "second");
+ Delete(1, "bar");
+ Put(1, "bar", "two");
+ ASSERT_OK(Flush(1));
+
+ SingleDelete(1, "foo");
+ Delete(1, "bar");
+ ASSERT_OK(Flush(1));
+
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+
+ ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBTest, SingleDeletePutFlush) {
+ // Single deletes that encounter the matching put in a flush should get
+ // removed.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "foo", Slice());
+ Put(1, "a", Slice());
+ SingleDelete(1, "a");
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory to run, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) {
+ const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024}; // 4GB value
+ std::string raw(kValueSize, 'v');
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.write_buffer_size = 100000; // Small write buffer
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("boo", "v1"));
+ ASSERT_TRUE(Put("foo", raw).IsInvalidArgument());
+ ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument());
+
+ WriteBatch wb;
+ ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument());
+
+ Slice value_slice = raw;
+ Slice key_slice = "foo";
+ SliceParts sp_key(&key_slice, 1);
+ SliceParts sp_value(&value_slice, 1);
+
+ ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument());
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory to run, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_VeryLargeValue) {
+ const size_t kValueSize = 3221225472u; // 3GB value
+ const size_t kKeySize = 8388608u; // 8MB key
+ std::string raw(kValueSize, 'v');
+ std::string key1(kKeySize, 'c');
+ std::string key2(kKeySize, 'd');
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("boo", "v1"));
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put(key1, raw));
+ raw[0] = 'w';
+ ASSERT_OK(Put(key2, raw));
+ dbfull()->TEST_WaitForFlushMemTable();
+
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+#endif // !ROCKSDB_LITE
+
+ std::string value;
+ Status s = db_->Get(ReadOptions(), key1, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('v', value[0]);
+
+ s = db_->Get(ReadOptions(), key2, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('w', value[0]);
+
+ // Compact all files.
+ Flush();
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ // Check DB is not in read-only state.
+ ASSERT_OK(Put("boo", "v1"));
+
+ s = db_->Get(ReadOptions(), key1, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('v', value[0]);
+
+ s = db_->Get(ReadOptions(), key2, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('w', value[0]);
+}
+
+TEST_F(DBTest, GetFromImmutableLayer) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+
+ // Block sync calls
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ Put(1, "k2", std::string(100000, 'y')); // Trigger flush
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ // Release sync calls
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+ } while (ChangeOptions());
+}
+
+
+TEST_F(DBTest, GetLevel0Ordering) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Check that we process level-0 files in the correct order. The code
+ // below generates two level-0 files where the earlier one comes
+ // before the later one in the level-0 file list since the earlier
+ // one has a smaller "smallest" key.
+ ASSERT_OK(Put(1, "bar", "b"));
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, WrongLevel0Config) {
+ Options options = CurrentOptions();
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ options.level0_stop_writes_trigger = 1;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_file_num_compaction_trigger = 3;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, GetOrderedByLevels) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ Compact(1, "a", "z");
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetPicksCorrectFile) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Arrange to have multiple files in a non-level-0 level.
+ ASSERT_OK(Put(1, "a", "va"));
+ Compact(1, "a", "b");
+ ASSERT_OK(Put(1, "x", "vx"));
+ Compact(1, "x", "y");
+ ASSERT_OK(Put(1, "f", "vf"));
+ Compact(1, "f", "g");
+ ASSERT_EQ("va", Get(1, "a"));
+ ASSERT_EQ("vf", Get(1, "f"));
+ ASSERT_EQ("vx", Get(1, "x"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetEncountersEmptyLevel) {
+ do {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Arrange for the following to happen:
+ // * sstable A in level 0
+ // * nothing in level 1
+ // * sstable B in level 2
+ // Then do enough Get() calls to arrange for an automatic compaction
+ // of sstable A. A bug would cause the compaction to be marked as
+ // occurring at level 1 (instead of the correct level 0).
+
+ // Step 1: First place sstables in levels 0 and 2
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ ASSERT_OK(Flush(1));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Step 2: clear level 1 if necessary.
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
+
+ // Step 3: read a bunch of times
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
+ }
+
+ // Step 4: Wait for compaction to finish
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FlushMultipleMemtable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_size_to_maintain = -1;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+ ASSERT_OK(Flush(1));
+ } while (ChangeCompactOptions());
+}
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushSchedule) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(options.write_buffer_size);
+ options.max_write_buffer_number = 2;
+ options.write_buffer_size = 120 * 1024;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+
+ std::atomic<int> thread_num(0);
+ // Each column family will have 5 threads, each thread generating 2 memtables.
+ // Each column family should end up with 10 table files.
+ std::function<void()> fill_memtable_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ Random rnd(a);
+ WriteOptions wo;
+ // this should fill up 2 memtables
+ for (int k = 0; k < 5000; ++k) {
+ ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), ""));
+ }
+ };
+
+ for (int i = 0; i < 10; ++i) {
+ threads.emplace_back(fill_memtable_func);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
+ auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+ ASSERT_LE(default_tables, static_cast<uint64_t>(10));
+ ASSERT_GT(default_tables, static_cast<uint64_t>(0));
+ ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
+ ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
+}
+#endif // ROCKSDB_LITE
+
+namespace {
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false)
+ : check_context_(check_context) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+ explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ db_test->env_->addon_time_.fetch_add(1000);
+ return true;
+ }
+
+ const char* Name() const override { return "DelayFilter"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+ }
+
+ const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+ DBTestBase* db_test;
+};
+} // namespace
+
+#ifndef ROCKSDB_LITE
+
+static std::string CompressibleString(Random* rnd, int len) {
+ std::string r;
+ test::CompressibleString(rnd, 0.8, len, &r);
+ return r;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FailMoreDbPaths) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 10000000);
+ options.db_paths.emplace_back(dbname_ + "_2", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_3", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_4", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_5", 1000000);
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+void CheckColumnFamilyMeta(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::vector<std::vector<FileMetaData>>& files_by_level,
+ uint64_t start_time, uint64_t end_time) {
+ ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName);
+ ASSERT_EQ(cf_meta.levels.size(), files_by_level.size());
+
+ uint64_t cf_size = 0;
+ size_t file_count = 0;
+
+ for (size_t i = 0; i < cf_meta.levels.size(); ++i) {
+ const auto& level_meta_from_cf = cf_meta.levels[i];
+ const auto& level_meta_from_files = files_by_level[i];
+
+ ASSERT_EQ(level_meta_from_cf.level, i);
+ ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size());
+
+ file_count += level_meta_from_cf.files.size();
+
+ uint64_t level_size = 0;
+ for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) {
+ const auto& file_meta_from_cf = level_meta_from_cf.files[j];
+ const auto& file_meta_from_files = level_meta_from_files[j];
+
+ level_size += file_meta_from_cf.size;
+
+ ASSERT_EQ(file_meta_from_cf.file_number,
+ file_meta_from_files.fd.GetNumber());
+ ASSERT_EQ(file_meta_from_cf.file_number,
+ TableFileNameToNumber(file_meta_from_cf.name));
+ ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size);
+ ASSERT_EQ(file_meta_from_cf.smallest_seqno,
+ file_meta_from_files.fd.smallest_seqno);
+ ASSERT_EQ(file_meta_from_cf.largest_seqno,
+ file_meta_from_files.fd.largest_seqno);
+ ASSERT_EQ(file_meta_from_cf.smallestkey,
+ file_meta_from_files.smallest.user_key().ToString());
+ ASSERT_EQ(file_meta_from_cf.largestkey,
+ file_meta_from_files.largest.user_key().ToString());
+ ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number,
+ file_meta_from_files.oldest_blob_file_number);
+ ASSERT_EQ(file_meta_from_cf.oldest_ancester_time,
+ file_meta_from_files.oldest_ancester_time);
+ ASSERT_EQ(file_meta_from_cf.file_creation_time,
+ file_meta_from_files.file_creation_time);
+ ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
+ ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
+ ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
+ ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
+ }
+
+ ASSERT_EQ(level_meta_from_cf.size, level_size);
+ cf_size += level_size;
+ }
+
+ ASSERT_EQ(cf_meta.file_count, file_count);
+ ASSERT_EQ(cf_meta.size, cf_size);
+}
+
+void CheckLiveFilesMeta(
+ const std::vector<LiveFileMetaData>& live_file_meta,
+ const std::vector<std::vector<FileMetaData>>& files_by_level) {
+ size_t total_file_count = 0;
+ for (const auto& f : files_by_level) {
+ total_file_count += f.size();
+ }
+
+ ASSERT_EQ(live_file_meta.size(), total_file_count);
+
+ int level = 0;
+ int i = 0;
+
+ for (const auto& meta : live_file_meta) {
+ if (level != meta.level) {
+ level = meta.level;
+ i = 0;
+ }
+
+ ASSERT_LT(i, files_by_level[level].size());
+
+ const auto& expected_meta = files_by_level[level][i];
+
+ ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName);
+ ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber());
+ ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name));
+ ASSERT_EQ(meta.size, expected_meta.fd.file_size);
+ ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno);
+ ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno);
+ ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString());
+ ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString());
+ ASSERT_EQ(meta.oldest_blob_file_number,
+ expected_meta.oldest_blob_file_number);
+
+ ++i;
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, MetaDataTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+
+ int64_t temp_time = 0;
+ options.env->GetCurrentTime(&temp_time);
+ uint64_t start_time = static_cast<uint64_t>(temp_time);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_index = 0;
+ for (int i = 0; i < 100; ++i) {
+ // Add a single blob reference to each file
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000,
+ /* offset */ 1234, /* size */ 5678, kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index),
+ blob_index));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ++key_index;
+
+ // Fill up the rest of the file with random values.
+ GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+
+ Flush();
+ }
+
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);
+
+ options.env->GetCurrentTime(&temp_time);
+ uint64_t end_time = static_cast<uint64_t>(temp_time);
+
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time);
+
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ CheckLiveFilesMeta(live_file_meta, files_by_level);
+}
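+
+// Illustrative sketch (not part of the upstream tests): how the metadata APIs
+// verified by CheckColumnFamilyMeta/CheckLiveFilesMeta above are consumed by
+// an application. Guarded by #if 0.
+#if 0
+static void MetaDataSketch(DB* db) {
+  ColumnFamilyMetaData cf_meta;
+  db->GetColumnFamilyMetaData(&cf_meta);  // default column family
+  for (const auto& level : cf_meta.levels) {
+    for (const auto& file : level.files) {
+      // file.name, file.size, file.smallestkey, file.largestkey, ...
+    }
+  }
+
+  std::vector<LiveFileMetaData> live_files;
+  db->GetLiveFilesMetaData(&live_files);  // all column families, all levels
+}
+#endif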
+
+namespace {
+void MinLevelHelper(DBTest* self, Options& options) {
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 120KB (12 values, each 10K)
+ for (int i = 0; i < 12; i++) {
+ values.push_back(DBTestBase::RandomString(&rnd, 10000));
+ ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+ }
+ self->dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+ }
+
+ // Generate one more file in level-0, which should trigger a level-0 compaction
+ std::vector<std::string> values;
+ for (int i = 0; i < 12; i++) {
+ values.push_back(DBTestBase::RandomString(&rnd, 10000));
+ ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+ }
+ self->dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
+
+// Returns false if the calling test should be skipped
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+ int lev, int strategy) {
+ fprintf(stderr,
+         "Test with compression options: window_bits = %d, level = %d, "
+         "strategy = %d\n",
+         wbits, lev, strategy);
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.create_if_missing = true;
+
+ if (Snappy_Supported()) {
+ type = kSnappyCompression;
+ fprintf(stderr, "using snappy\n");
+ } else if (Zlib_Supported()) {
+ type = kZlibCompression;
+ fprintf(stderr, "using zlib\n");
+ } else if (BZip2_Supported()) {
+ type = kBZip2Compression;
+ fprintf(stderr, "using bzip2\n");
+ } else if (LZ4_Supported()) {
+ type = kLZ4Compression;
+ fprintf(stderr, "using lz4\n");
+ } else if (XPRESS_Supported()) {
+ type = kXpressCompression;
+ fprintf(stderr, "using xpress\n");
+ } else if (ZSTD_Supported()) {
+ type = kZSTD;
+ fprintf(stderr, "using ZSTD\n");
+ } else {
+ fprintf(stderr, "skipping test, compression disabled\n");
+ return false;
+ }
+ options.compression_per_level.resize(options.num_levels);
+
+ // do not compress L0
+ for (int i = 0; i < 1; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 1; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ return true;
+}
+} // namespace
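+
+// Illustrative sketch (not part of the upstream tests): the per-level
+// compression setup that MinLevelToCompress() builds above. Levels below the
+// chosen minimum stay uncompressed; the remaining levels use the detected
+// codec. Guarded by #if 0.
+#if 0
+static void CompressionPerLevelSketch(Options* options) {
+  options->num_levels = 3;
+  options->compression_per_level.resize(options->num_levels);
+  options->compression_per_level[0] = kNoCompression;     // L0 uncompressed
+  options->compression_per_level[1] = kSnappyCompression;
+  options->compression_per_level[2] = kSnappyCompression;
+}
+#endif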
+
+TEST_F(DBTest, MinLevelToCompress1) {
+ Options options = CurrentOptions();
+ CompressionType type = kSnappyCompression;
+ if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+ return;
+ }
+ Reopen(options);
+ MinLevelHelper(this, options);
+
+ // do not compress L0 and L1
+ for (int i = 0; i < 2; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 2; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ DestroyAndReopen(options);
+ MinLevelHelper(this, options);
+}
+
+TEST_F(DBTest, MinLevelToCompress2) {
+ Options options = CurrentOptions();
+ CompressionType type = kSnappyCompression;
+ if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+ return;
+ }
+ Reopen(options);
+ MinLevelHelper(this, options);
+
+ // do not compress L0 and L1
+ for (int i = 0; i < 2; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 2; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ DestroyAndReopen(options);
+ MinLevelHelper(this, options);
+}
+
+// This test may fail because of a legitimate case where multiple L0 files
+// are trivially moved to L1.
+TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // We must have at most one file per level except for level-0,
+ // which may have up to kL0_StopWritesTrigger files.
+ const int kMaxFiles =
+ options.num_levels + options.level0_stop_writes_trigger;
+
+ Random rnd(301);
+ std::string value =
+ RandomString(&rnd, static_cast<int>(2 * options.write_buffer_size));
+ for (int i = 0; i < 5 * kMaxFiles; i++) {
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_LE(TotalTableFiles(1), kMaxFiles);
+ }
+ } while (ChangeCompactOptions());
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, SparseMerge) {
+ do {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ FillLevels("A", "Z", 1);
+
+ // Suppose there is:
+ // small amount of data with prefix A
+ // large amount of data with prefix B
+ // small amount of data with prefix C
+ // and that recent updates have made small changes to all three prefixes.
+ // Check that we do not do a compaction that merges all of B in one shot.
+ const std::string value(1000, 'x');
+ Put(1, "A", "va");
+ // Write approximately 100MB of "B" values
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(1, key, value);
+ }
+ Put(1, "C", "vc");
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+ // Make sparse update
+ Put(1, "A", "va2");
+ Put(1, "B100", "bvalue2");
+ Put(1, "C", "vc2");
+ ASSERT_OK(Flush(1));
+
+ // Compactions should not cause us to create a situation where
+ // a file overlaps too much data at the next level.
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+ 20 * 1048576);
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+ 20 * 1048576);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+ 20 * 1048576);
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+ bool result = (val >= low) && (val <= high);
+ if (!result) {
+ fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+ (unsigned long long)(val), (unsigned long long)(low),
+ (unsigned long long)(high));
+ }
+ return result;
+}
+
+TEST_F(DBTest, ApproximateSizesMemTable) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ auto default_cf = db_->DefaultColumnFamily();
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ }
+
+ uint64_t size;
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtabtles = true;
+ size_approx_options.include_files = true;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+ // Zero if not including mem table
+ db_->GetApproximateSizes(&r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024)));
+ }
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ start = Key(100);
+ end = Key(1020);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_GT(size, 6000);
+
+ options.max_write_buffer_number = 8;
+ options.min_write_buffer_number_to_merge = 5;
+ options.write_buffer_size = 1024 * N; // Not very large
+ DestroyAndReopen(options);
+ default_cf = db_->DefaultColumnFamily();
+
+ int keys[N * 3];
+ for (int i = 0; i < N; i++) {
+ keys[i * 3] = i * 5;
+ keys[i * 3 + 1] = i * 5 + 1;
+ keys[i * 3 + 2] = i * 5 + 2;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ for (int i = 0; i < N * 3; i++) {
+ ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024)));
+ }
+
+ start = Key(100);
+ end = Key(300);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_GT(size, 6000);
+
+ start = Key(2100);
+ end = Key(2300);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, 0);
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ uint64_t size_with_mt, size_without_mt;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size_with_mt);
+ ASSERT_GT(size_with_mt, 6000);
+ db_->GetApproximateSizes(&r, 1, &size_without_mt);
+ ASSERT_EQ(size_without_mt, 0);
+
+ Flush();
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024)));
+ }
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size_with_mt);
+ db_->GetApproximateSizes(&r, 1, &size_without_mt);
+ ASSERT_GT(size_with_mt, size_without_mt);
+ ASSERT_GT(size_without_mt, 6000);
+
+ // Check that include_memtabtles flag works as expected
+ size_approx_options.include_memtabtles = false;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_EQ(size, size_without_mt);
+
+ // Check that files_size_error_margin works as expected when the heuristic
+ // conditions are not met
+ start = Key(1);
+ end = Key(1000 + N - 2);
+ r = Range(start, end);
+ size_approx_options.files_size_error_margin = -1.0; // disabled
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ uint64_t size2;
+ size_approx_options.files_size_error_margin = 0.5; // enabled, but not used
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
+ ASSERT_EQ(size, size2);
+}
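+
+// Illustrative sketch (not part of the upstream tests): asking for an
+// approximate size that includes memtable data, as exercised above. Note that
+// the field really is spelled include_memtabtles in this version's
+// SizeApproximationOptions; the keys used here are made up. Guarded by #if 0.
+#if 0
+static void ApproximateSizesSketch(DB* db) {
+  SizeApproximationOptions opts;
+  opts.include_memtabtles = true;  // also count unflushed data
+  opts.include_files = true;
+
+  std::string start = "key050";
+  std::string end = "key060";
+  Range r(start, end);
+
+  uint64_t size = 0;
+  db->GetApproximateSizes(opts, db->DefaultColumnFamily(), &r, 1, &size);
+}
+#endif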
+
+TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024 * 1024;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.target_file_size_base = 1024 * 1024;
+ DestroyAndReopen(options);
+ const auto default_cf = db_->DefaultColumnFamily();
+
+ const int N = 64000;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ }
+ // Flush everything to files
+ Flush();
+ // Compact the entire key space into the next level
+ db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr);
+
+ // Write more keys
+ for (int i = N; i < (N + N / 4); i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ }
+ // Flush everything to files again
+ Flush();
+
+ // Wait for compaction to finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ const std::string start = Key(0);
+ const std::string end = Key(2 * N);
+ const Range r(start, end);
+
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtabtles = false;
+ size_approx_options.include_files = true;
+ size_approx_options.files_size_error_margin = -1.0; // disabled
+
+ // Get the precise size without any approximation heuristic
+ uint64_t size;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+ ASSERT_NE(size, 0);
+
+ // Get the size with an approximation heuristic
+ uint64_t size2;
+ const double error_margin = 0.2;
+ size_approx_options.files_size_error_margin = error_margin;
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
+ ASSERT_LT(size2, size * (1 + error_margin));
+ ASSERT_GT(size2, size * (1 - error_margin));
+}
+
+TEST_F(DBTest, GetApproximateMemTableStats) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ }
+
+ uint64_t count;
+ uint64_t size;
+
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_GT(count, 0);
+ ASSERT_LE(count, N);
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(size, 0);
+
+ Flush();
+
+ start = Key(50);
+ end = Key(60);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(size, 0);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024)));
+ }
+
+ start = Key(100);
+ end = Key(1020);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_GT(count, 20);
+ ASSERT_GT(size, 6000);
+}
+
+TEST_F(DBTest, ApproximateSizes) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ const int N = 80;
+ static const int S1 = 100000;
+ static const int S2 = 105000; // Allow some expansion from metadata
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1)));
+ }
+
+ // 0 because GetApproximateSizes() does not account for memtable space
+ ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int compact_start = 0; compact_start < N; compact_start += 10) {
+ for (int i = 0; i < N; i += 10) {
+ ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i));
+ ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1),
+ S2 * (i + 1)));
+ ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10));
+ }
+ ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50));
+ ASSERT_TRUE(
+ Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50));
+
+ std::string cstart_str = Key(compact_start);
+ std::string cend_str = Key(compact_start + 9);
+ Slice cstart = cstart_str;
+ Slice cend = cend_str;
+ dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]);
+ }
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+ }
+ // ApproximateOffsetOf() is not yet implemented in plain table format.
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+ kSkipPlainTable | kSkipHashIndex));
+}
+
+TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+ do {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ std::string big1 = RandomString(&rnd, 100000);
+ ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(1, Key(2), big1));
+ ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(1, Key(4), big1));
+ ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000)));
+ ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000)));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0));
+ ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000));
+ ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000));
+ ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000));
+ ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000));
+ ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000));
+ ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000));
+ ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000));
+ ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000));
+
+ ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000));
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+ }
+ // ApproximateOffsetOf() is not yet implemented in plain table format.
+ } while (ChangeOptions(kSkipPlainTable));
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, Snapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ Put(0, "foo", "0v1");
+ Put(1, "foo", "1v1");
+
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_EQ(1U, GetNumSnapshots());
+ uint64_t time_snap1 = GetTimeOldestSnapshots();
+ ASSERT_GT(time_snap1, 0U);
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ Put(0, "foo", "0v2");
+ Put(1, "foo", "1v2");
+
+ env_->addon_time_.fetch_add(1);
+
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ Put(0, "foo", "0v3");
+ Put(1, "foo", "1v3");
+
+ {
+ ManagedSnapshot s3(db_);
+ ASSERT_EQ(3U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+ Put(0, "foo", "0v4");
+ Put(1, "foo", "1v4");
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+ ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ }
+
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ ASSERT_EQ(1U, GetNumSnapshots());
+ ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ(0U, GetNumSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ } while (ChangeOptions());
+}
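+
+// Illustrative sketch (not part of the upstream tests): point-in-time reads
+// with snapshots, as exercised above. A snapshot pins the sequence number at
+// GetSnapshot() time; ManagedSnapshot is the RAII alternative that releases
+// it automatically. Guarded by #if 0.
+#if 0
+static void SnapshotSketch(DB* db) {
+  const Snapshot* snap = db->GetSnapshot();
+  db->Put(WriteOptions(), "foo", "v2");  // not visible through `snap`
+
+  ReadOptions ro;
+  ro.snapshot = snap;
+  std::string value;
+  db->Get(ro, "foo", &value);  // sees the pre-snapshot value, if any
+
+  db->ReleaseSnapshot(snap);  // every GetSnapshot() needs a ReleaseSnapshot()
+
+  {
+    ManagedSnapshot managed(db);
+    ro.snapshot = managed.snapshot();
+    db->Get(ro, "foo", &value);
+  }  // snapshot released here
+}
+#endif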
+
+TEST_F(DBTest, HiddenValuesAreRemoved) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ FillLevels("a", "z", 1);
+
+ std::string big = RandomString(&rnd, 50000);
+ Put(1, "foo", big);
+ Put(1, "pastfoo", "v");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ Put(1, "foo", "tiny");
+ Put(1, "pastfoo2", "v2"); // Advance sequence number one more
+
+ ASSERT_OK(Flush(1));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+
+ ASSERT_EQ(big, Get(1, "foo", snapshot));
+ ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000));
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
+ Slice x("x");
+ dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
+ dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+
+ ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000));
+ // ApproximateOffsetOf() is not yet implemented in plain table format,
+ // which is used by Size().
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+ kSkipPlainTable));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, UnremovableSingleDelete) {
+ // If we compact:
+ //
+ // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2)
+ //
+ // We do not want to end up with:
+ //
+ // Put(A, v1) Snapshot Put(A, v2)
+ //
+ // Because a subsequent SingleDelete(A) would delete the Put(A, v2)
+ // but not Put(A, v1), so Get(A) would return v1.
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "foo", "first");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ SingleDelete(1, "foo");
+ Put(1, "foo", "second");
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("second", Get(1, "foo"));
+
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+ ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1));
+
+ SingleDelete(1, "foo");
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr);
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DeletionMarkers1) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put(1, "foo", "v1");
+ ASSERT_OK(Flush(1));
+ const int last = 2;
+ MoveFilesToLevel(last, 1);
+ // foo => v1 is now in last level
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ Flush(1);
+ MoveFilesToLevel(last - 1, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+ Delete(1, "foo");
+ Put(1, "foo", "v2");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+ ASSERT_OK(Flush(1)); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+ Slice z("z");
+ dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]);
+ // DEL eliminated, but v1 remains because we aren't compacting that level
+ // (DEL can be eliminated because v2 hides v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+ // Merging last-1 with last, so we are at the base level for "foo", so
+ // DEL is removed (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+}
+
+TEST_F(DBTest, DeletionMarkers2) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Put(1, "foo", "v1");
+ ASSERT_OK(Flush(1));
+ const int last = 2;
+ MoveFilesToLevel(last, 1);
+ // foo => v1 is now in last level
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ Flush(1);
+ MoveFilesToLevel(last - 1, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+ Delete(1, "foo");
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ ASSERT_OK(Flush(1)); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]);
+ // DEL kept: "last" file overlaps
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+ // Merging last-1 with last, so we are at the base level for "foo", so
+ // DEL is removed (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+}
+
+TEST_F(DBTest, OverlapInLevel0) {
+ do {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Fill levels 1 and 2 to disable the pushing of new memtables to levels >
+ // 0.
+ ASSERT_OK(Put(1, "100", "v100"));
+ ASSERT_OK(Put(1, "999", "v999"));
+ Flush(1);
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Delete(1, "100"));
+ ASSERT_OK(Delete(1, "999"));
+ Flush(1);
+ MoveFilesToLevel(1, 1);
+ ASSERT_EQ("0,1,1", FilesPerLevel(1));
+
+ // Make files spanning the following ranges in level-0:
+ // files[0] 200 .. 900
+ // files[1] 300 .. 500
+ // Note that files are sorted by smallest key.
+ ASSERT_OK(Put(1, "300", "v300"));
+ ASSERT_OK(Put(1, "500", "v500"));
+ Flush(1);
+ ASSERT_OK(Put(1, "200", "v200"));
+ ASSERT_OK(Put(1, "600", "v600"));
+ ASSERT_OK(Put(1, "900", "v900"));
+ Flush(1);
+ ASSERT_EQ("2,1,1", FilesPerLevel(1));
+
+ // Compact away the placeholder files we created initially
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+ dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]);
+ ASSERT_EQ("2", FilesPerLevel(1));
+
+    // Do a memtable flush. Before the bug fix, the resulting compaction would
+    // not detect the overlap with level-0 files and would incorrectly place
+    // the deletion in a deeper level.
+ ASSERT_OK(Delete(1, "600"));
+ Flush(1);
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ("NOT_FOUND", Get(1, "600"));
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, ComparatorCheck) {
+ class NewComparator : public Comparator {
+ public:
+ const char* Name() const override { return "rocksdb.NewComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return BytewiseComparator()->Compare(a, b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ BytewiseComparator()->FindShortestSeparator(s, l);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ BytewiseComparator()->FindShortSuccessor(key);
+ }
+ };
+ Options new_options, options;
+ NewComparator cmp;
+ do {
+ options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ new_options = CurrentOptions();
+ new_options.comparator = &cmp;
+ // only the non-default column family has non-matching comparator
+ Status s = TryReopenWithColumnFamilies(
+ {"default", "pikachu"}, std::vector<Options>({options, new_options}));
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+ << s.ToString();
+ } while (ChangeCompactOptions());
+}
+
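+// CustomComparator orders keys of the form "[<number>]" numerically, so the
+// decimal and hex spellings of the same value (e.g. "[20]" and "[0x14]")
+// compare equal and act as a single key.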
+TEST_F(DBTest, CustomComparator) {
+ class NumberComparator : public Comparator {
+ public:
+ const char* Name() const override { return "test.NumberComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return ToNumber(a) - ToNumber(b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ ToNumber(*s); // Check format
+ ToNumber(l); // Check format
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ ToNumber(*key); // Check format
+ }
+
+ private:
+ static int ToNumber(const Slice& x) {
+ // Check that there are no extra characters.
+ EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
+ << EscapeString(x);
+ int val;
+ char ignored;
+ EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+ << EscapeString(x);
+ return val;
+ }
+ };
+ Options new_options;
+ NumberComparator cmp;
+ do {
+ new_options = CurrentOptions();
+ new_options.create_if_missing = true;
+ new_options.comparator = &cmp;
+ new_options.write_buffer_size = 4096; // Compact more often
+ new_options.arena_block_size = 4096;
+ new_options = CurrentOptions(new_options);
+ DestroyAndReopen(new_options);
+ CreateAndReopenWithCF({"pikachu"}, new_options);
+ ASSERT_OK(Put(1, "[10]", "ten"));
+ ASSERT_OK(Put(1, "[0x14]", "twenty"));
+ for (int i = 0; i < 2; i++) {
+ ASSERT_EQ("ten", Get(1, "[10]"));
+ ASSERT_EQ("ten", Get(1, "[0xa]"));
+ ASSERT_EQ("twenty", Get(1, "[20]"));
+ ASSERT_EQ("twenty", Get(1, "[0x14]"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
+ Compact(1, "[0]", "[9999]");
+ }
+
+ for (int run = 0; run < 2; run++) {
+ for (int i = 0; i < 1000; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "[%d]", i * 10);
+ ASSERT_OK(Put(1, buf, buf));
+ }
+ Compact(1, "[0]", "[1000000]");
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, DBOpen_Options) {
+ Options options = CurrentOptions();
+ std::string dbname = test::PerThreadDBPath("db_options_test");
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ options.create_if_missing = false;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ options.create_if_missing = true;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ options.create_if_missing = false;
+ options.error_if_exists = true;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST_F(DBTest, DBOpen_Change_NumLevels) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_TRUE(db_ != nullptr);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "a", "123"));
+ ASSERT_OK(Put(1, "b", "234"));
+ Flush(1);
+ MoveFilesToLevel(3, 1);
+ Close();
+
+ options.create_if_missing = false;
+ options.num_levels = 2;
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+ ASSERT_TRUE(db_ == nullptr);
+}
+
+TEST_F(DBTest, DestroyDBMetaDatabase) {
+ std::string dbname = test::PerThreadDBPath("db_meta");
+ ASSERT_OK(env_->CreateDirIfMissing(dbname));
+ std::string metadbname = MetaDatabaseName(dbname, 0);
+ ASSERT_OK(env_->CreateDirIfMissing(metadbname));
+ std::string metametadbname = MetaDatabaseName(metadbname, 0);
+ ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
+
+ // Destroy previous versions if they exist. Using the long way.
+ Options options = CurrentOptions();
+ ASSERT_OK(DestroyDB(metametadbname, options));
+ ASSERT_OK(DestroyDB(metadbname, options));
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Setup databases
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(options, dbname, &db));
+ delete db;
+ db = nullptr;
+ ASSERT_OK(DB::Open(options, metadbname, &db));
+ delete db;
+ db = nullptr;
+ ASSERT_OK(DB::Open(options, metametadbname, &db));
+ delete db;
+ db = nullptr;
+
+ // Delete databases
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Check if deletion worked.
+ options.create_if_missing = false;
+ ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
+ ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
+ ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+}
+
+#ifndef ROCKSDB_LITE
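+// SnapshotFiles simulates a backup: with file deletions disabled, it copies
+// the files reported by GetLiveFiles() into a separate directory (copying
+// only the first manifest_size bytes of the MANIFEST) and then opens that
+// copy as an independent DB to verify its contents.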
+TEST_F(DBTest, SnapshotFiles) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put((i < 40), Key(i), values[i]));
+ }
+
+ // assert that nothing makes it to disk yet.
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+
+ // get a file snapshot
+ uint64_t manifest_number = 0;
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ dbfull()->DisableFileDeletions();
+ dbfull()->GetLiveFiles(files, &manifest_size);
+
+ // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF)
+ ASSERT_EQ(files.size(), 5U);
+
+ uint64_t number = 0;
+ FileType type;
+
+ // copy these files to a new snapshot directory
+ std::string snapdir = dbname_ + ".snapdir/";
+ ASSERT_OK(env_->CreateDirIfMissing(snapdir));
+
+ for (size_t i = 0; i < files.size(); i++) {
+ // our clients require that GetLiveFiles returns
+ // files with "/" as first character!
+ ASSERT_EQ(files[i][0], '/');
+ std::string src = dbname_ + files[i];
+ std::string dest = snapdir + files[i];
+
+ uint64_t size;
+ ASSERT_OK(env_->GetFileSize(src, &size));
+
+ // record the number and the size of the
+ // latest manifest file
+ if (ParseFileName(files[i].substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ if (number > manifest_number) {
+ manifest_number = number;
+ ASSERT_GE(size, manifest_size);
+ size = manifest_size; // copy only valid MANIFEST data
+ }
+ }
+ }
+ CopyFile(src, dest, size);
+ }
+
+ // release file snapshot
+ dbfull()->DisableFileDeletions();
+    // Overwrite one key; this key should not appear in the snapshot.
+ std::vector<std::string> extras;
+ for (unsigned int i = 0; i < 1; i++) {
+ extras.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put(0, Key(i), extras[i]));
+ }
+
+ // verify that data in the snapshot are correct
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back("default", ColumnFamilyOptions());
+ column_families.emplace_back("pikachu", ColumnFamilyOptions());
+ std::vector<ColumnFamilyHandle*> cf_handles;
+ DB* snapdb;
+ DBOptions opts;
+ opts.env = env_;
+ opts.create_if_missing = false;
+ Status stat =
+ DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+ ASSERT_OK(stat);
+
+ ReadOptions roptions;
+ std::string val;
+ for (unsigned int i = 0; i < 80; i++) {
+ stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val);
+ ASSERT_EQ(values[i].compare(val), 0);
+ }
+ for (auto cfh : cf_handles) {
+ delete cfh;
+ }
+ delete snapdb;
+
+ // look at the new live files after we added an 'extra' key
+ // and after we took the first snapshot.
+ uint64_t new_manifest_number = 0;
+ uint64_t new_manifest_size = 0;
+ std::vector<std::string> newfiles;
+ dbfull()->DisableFileDeletions();
+ dbfull()->GetLiveFiles(newfiles, &new_manifest_size);
+
+    // Find the new manifest file. Assert that this manifest file is
+    // the same one as in the previous snapshot. But its size should be
+    // larger because we added an extra key after taking the
+    // previous snapshot.
+ for (size_t i = 0; i < newfiles.size(); i++) {
+ std::string src = dbname_ + "/" + newfiles[i];
+      // record the number and the size of the
+      // latest manifest file
+ if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ if (number > new_manifest_number) {
+ uint64_t size;
+ new_manifest_number = number;
+ ASSERT_OK(env_->GetFileSize(src, &size));
+ ASSERT_GE(size, new_manifest_size);
+ }
+ }
+ }
+ }
+ ASSERT_EQ(manifest_number, new_manifest_number);
+ ASSERT_GT(new_manifest_size, manifest_size);
+
+ // release file snapshot
+ dbfull()->DisableFileDeletions();
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) {
+ do {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ dbfull()->GetLiveFiles(files, &manifest_size);
+
+ for (const std::string& f : files) {
+ uint64_t number = 0;
+ FileType type;
+ if (ParseFileName(f.substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ uint64_t size_on_disk;
+ env_->GetFileSize(dbname_ + "/" + f, &size_on_disk);
+ ASSERT_EQ(manifest_size, size_on_disk);
+ break;
+ }
+ }
+ }
+ Close();
+ } while (ChangeCompactOptions());
+}
+#endif
+
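+// PurgeInfoLogs verifies that at most keep_log_file_num info log (LOG*) files
+// are retained across reopens, both with the default log location and with a
+// separate db_log_dir.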
+TEST_F(DBTest, PurgeInfoLogs) {
+ Options options = CurrentOptions();
+ options.keep_log_file_num = 5;
+ options.create_if_missing = true;
+ for (int mode = 0; mode <= 1; mode++) {
+ if (mode == 1) {
+ options.db_log_dir = dbname_ + "_logs";
+ env_->CreateDirIfMissing(options.db_log_dir);
+ } else {
+ options.db_log_dir = "";
+ }
+ for (int i = 0; i < 8; i++) {
+ Reopen(options);
+ }
+
+ std::vector<std::string> files;
+ env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir,
+ &files);
+ int info_log_count = 0;
+ for (std::string file : files) {
+ if (file.find("LOG") != std::string::npos) {
+ info_log_count++;
+ }
+ }
+ ASSERT_EQ(5, info_log_count);
+
+ Destroy(options);
+    // For mode 0, DestroyDB() should have deleted all the info logs under the
+    // DB dir. For mode 1, no info log file should have been put under the DB
+    // dir in the first place.
+ std::vector<std::string> db_files;
+ env_->GetChildren(dbname_, &db_files);
+ for (std::string file : db_files) {
+ ASSERT_TRUE(file.find("LOG") == std::string::npos);
+ }
+
+ if (mode == 1) {
+ // Cleaning up
+ env_->GetChildren(options.db_log_dir, &files);
+ for (std::string file : files) {
+ env_->DeleteFile(options.db_log_dir + "/" + file);
+ }
+ env_->DeleteDir(options.db_log_dir);
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+// Multi-threaded test:
+namespace {
+
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+ DBTest* test;
+ std::atomic<bool> stop;
+ std::atomic<int> counter[kNumThreads];
+ std::atomic<bool> thread_done[kNumThreads];
+};
+
+struct MTThread {
+ MTState* state;
+ int id;
+ bool multiget_batched;
+};
+
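+// Each thread repeatedly either writes a value of the form
+// <key.id.counter.cf.unique_id> to every column family in one batch, or reads
+// the key back from all column families and checks that the embedded
+// unique_id is identical across them (i.e. the multi-CF write was atomic).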
+static void MTThreadBody(void* arg) {
+ MTThread* t = reinterpret_cast<MTThread*>(arg);
+ int id = t->id;
+ DB* db = t->state->test->db_;
+ int counter = 0;
+ fprintf(stderr, "... starting thread %d\n", id);
+ Random rnd(1000 + id);
+ char valbuf[1500];
+ while (t->state->stop.load(std::memory_order_acquire) == false) {
+ t->state->counter[id].store(counter, std::memory_order_release);
+
+ int key = rnd.Uniform(kNumKeys);
+ char keybuf[20];
+ snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+ if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter, cf, unique_id>
+      // into each of the CFs.
+      // We add some padding to force compactions.
+ int unique_id = rnd.Uniform(1000000);
+
+ // Half of the time directly use WriteBatch. Half of the time use
+ // WriteBatchWithIndex.
+ if (rnd.OneIn(2)) {
+ WriteBatch batch;
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+ static_cast<int>(counter), cf, unique_id);
+ batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+ } else {
+ WriteBatchWithIndex batch(db->GetOptions().comparator);
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+ static_cast<int>(counter), cf, unique_id);
+ batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
+ }
+ } else {
+ // Read a value and verify that it matches the pattern written above
+ // and that writes to all column families were atomic (unique_id is the
+ // same)
+ std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+ std::vector<std::string> values;
+ std::vector<Status> statuses;
+ if (!t->multiget_batched) {
+ statuses = db->MultiGet(ReadOptions(), t->state->test->handles_, keys,
+ &values);
+ } else {
+ std::vector<PinnableSlice> pin_values(keys.size());
+ statuses.resize(keys.size());
+ const Snapshot* snapshot = db->GetSnapshot();
+ ReadOptions ro;
+ ro.snapshot = snapshot;
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ db->MultiGet(ro, t->state->test->handles_[cf], 1, &keys[cf],
+ &pin_values[cf], &statuses[cf]);
+ }
+ db->ReleaseSnapshot(snapshot);
+ values.resize(keys.size());
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ if (statuses[cf].ok()) {
+ values[cf].assign(pin_values[cf].data(), pin_values[cf].size());
+ }
+ }
+ }
+ Status s = statuses[0];
+ // all statuses have to be the same
+ for (size_t i = 1; i < statuses.size(); ++i) {
+ // they are either both ok or both not-found
+ ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+ (s.IsNotFound() && statuses[i].IsNotFound()));
+ }
+ if (s.IsNotFound()) {
+ // Key has not yet been written
+ } else {
+ // Check that the writer thread counter is >= the counter in the value
+ ASSERT_OK(s);
+ int unique_id = -1;
+ for (int i = 0; i < kColumnFamilies; ++i) {
+ int k, w, c, cf, u;
+ ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c,
+ &cf, &u))
+ << values[i];
+ ASSERT_EQ(k, key);
+ ASSERT_GE(w, 0);
+ ASSERT_LT(w, kNumThreads);
+ ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
+ ASSERT_EQ(cf, i);
+ if (i == 0) {
+ unique_id = u;
+ } else {
+ // this checks that updates across column families happened
+ // atomically -- all unique ids are the same
+ ASSERT_EQ(u, unique_id);
+ }
+ }
+ }
+ }
+ counter++;
+ }
+ t->state->thread_done[id].store(true, std::memory_order_release);
+ fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+} // namespace
+
+class MultiThreadedDBTest
+ : public DBTest,
+ public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ void SetUp() override {
+ std::tie(option_config_, multiget_batched_) = GetParam();
+ }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> optionConfigs;
+ for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
+ optionConfigs.push_back(optionConfig);
+ }
+ return optionConfigs;
+ }
+
+ bool multiget_batched_;
+};
+
+TEST_P(MultiThreadedDBTest, MultiThreaded) {
+ if (option_config_ == kPipelinedWrite) return;
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ std::vector<std::string> cfs;
+ for (int i = 1; i < kColumnFamilies; ++i) {
+ cfs.push_back(ToString(i));
+ }
+ Reopen(options);
+ CreateAndReopenWithCF(cfs, options);
+ // Initialize state
+ MTState mt;
+ mt.test = this;
+ mt.stop.store(false, std::memory_order_release);
+ for (int id = 0; id < kNumThreads; id++) {
+ mt.counter[id].store(0, std::memory_order_release);
+ mt.thread_done[id].store(false, std::memory_order_release);
+ }
+
+ // Start threads
+ MTThread thread[kNumThreads];
+ for (int id = 0; id < kNumThreads; id++) {
+ thread[id].state = &mt;
+ thread[id].id = id;
+ thread[id].multiget_batched = multiget_batched_;
+ env_->StartThread(MTThreadBody, &thread[id]);
+ }
+
+ // Let them run for a while
+ env_->SleepForMicroseconds(kTestSeconds * 1000000);
+
+ // Stop the threads and wait for them to finish
+ mt.stop.store(true, std::memory_order_release);
+ for (int id = 0; id < kNumThreads; id++) {
+ while (mt.thread_done[id].load(std::memory_order_acquire) == false) {
+ env_->SleepForMicroseconds(100000);
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ MultiThreaded, MultiThreadedDBTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()),
+ ::testing::Bool()));
+#endif // ROCKSDB_LITE
+
+// Group commit test:
+#if !defined(TRAVIS) && !defined(OS_WIN)
+// Disable this test temporarily on Travis and appveyor as it fails
+// intermittently. Github issue: #4151
+namespace {
+
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+struct GCThread {
+ DB* db;
+ int id;
+ std::atomic<bool> done;
+};
+
+static void GCThreadBody(void* arg) {
+ GCThread* t = reinterpret_cast<GCThread*>(arg);
+ int id = t->id;
+ DB* db = t->db;
+ WriteOptions wo;
+
+ for (int i = 0; i < kGCNumKeys; ++i) {
+ std::string kv(ToString(i + id * kGCNumKeys));
+ ASSERT_OK(db->Put(wo, kv, kv));
+ }
+ t->done = true;
+}
+
+} // namespace
+
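+// GroupCommitTest starts several writer threads and uses sync points to
+// encourage them to queue behind a single batch-group leader; a non-zero
+// WRITE_DONE_BY_OTHER ticker confirms that some writes were committed as part
+// of another thread's write group.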
+TEST_F(DBTest, GroupCommitTest) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Reopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WriteThread::JoinBatchGroup:BeganWaiting",
+ "DBImpl::WriteImpl:BeforeLeaderEnters"},
+ {"WriteThread::AwaitState:BlockingWaiting",
+ "WriteThread::EnterAsBatchGroupLeader:End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start threads
+ GCThread thread[kGCNumThreads];
+ for (int id = 0; id < kGCNumThreads; id++) {
+ thread[id].id = id;
+ thread[id].db = db_;
+ thread[id].done = false;
+ env_->StartThread(GCThreadBody, &thread[id]);
+ }
+ env_->WaitForJoin();
+
+ ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+
+ std::vector<std::string> expected_db;
+ for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+ expected_db.push_back(ToString(i));
+ }
+ std::sort(expected_db.begin(), expected_db.end());
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ for (auto x : expected_db) {
+ ASSERT_TRUE(itr->Valid());
+ ASSERT_EQ(itr->key().ToString(), x);
+ ASSERT_EQ(itr->value().ToString(), x);
+ itr->Next();
+ }
+ ASSERT_TRUE(!itr->Valid());
+ delete itr;
+
+ HistogramData hist_data;
+ options.statistics->histogramData(DB_WRITE, &hist_data);
+ ASSERT_GT(hist_data.average, 0.0);
+ } while (ChangeOptions(kSkipNoSeekToLast));
+}
+#endif  // !defined(TRAVIS) && !defined(OS_WIN)
+
+namespace {
+typedef std::map<std::string, std::string> KVMap;
+}
+
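+// ModelDB is a minimal in-memory reference implementation of the DB interface
+// backed by a std::map. The randomized test below applies the same operations
+// to a ModelDB and to the real DB and compares their iterators.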
+class ModelDB : public DB {
+ public:
+ class ModelSnapshot : public Snapshot {
+ public:
+ KVMap map_;
+
+ SequenceNumber GetSequenceNumber() const override {
+ // no need to call this
+ assert(false);
+ return 0;
+ }
+ };
+
+ explicit ModelDB(const Options& options) : options_(options) {}
+ using DB::Put;
+ Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+ const Slice& v) override {
+ WriteBatch batch;
+ batch.Put(cf, k, v);
+ return Write(o, &batch);
+ }
+ using DB::Close;
+ Status Close() override { return Status::OK(); }
+ using DB::Delete;
+ Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+ const Slice& key) override {
+ WriteBatch batch;
+ batch.Delete(cf, key);
+ return Write(o, &batch);
+ }
+ using DB::SingleDelete;
+ Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
+ const Slice& key) override {
+ WriteBatch batch;
+ batch.SingleDelete(cf, key);
+ return Write(o, &batch);
+ }
+ using DB::Merge;
+ Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+ const Slice& v) override {
+ WriteBatch batch;
+ batch.Merge(cf, k, v);
+ return Write(o, &batch);
+ }
+ using DB::Get;
+ Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& key, PinnableSlice* /*value*/) override {
+ return Status::NotSupported(key);
+ }
+
+ using DB::GetMergeOperands;
+ virtual Status GetMergeOperands(
+ const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
+ const Slice& key, PinnableSlice* /*slice*/,
+ GetMergeOperandsOptions* /*merge_operands_options*/,
+ int* /*number_of_operands*/) override {
+ return Status::NotSupported(key);
+ }
+
+ using DB::MultiGet;
+ std::vector<Status> MultiGet(
+ const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* /*values*/) override {
+ std::vector<Status> s(keys.size(),
+ Status::NotSupported("Not implemented."));
+ return s;
+ }
+
+#ifndef ROCKSDB_LITE
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*options*/) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::IngestExternalFiles;
+ Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& /*args*/) override {
+ return Status::NotSupported("Not implemented");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::VerifyChecksum;
+ Status VerifyChecksum(const ReadOptions&) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::GetPropertiesOfAllTables;
+ Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* /*column_family*/,
+ TablePropertiesCollection* /*props*/) override {
+ return Status();
+ }
+
+ Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* /*column_family*/, const Range* /*range*/,
+ std::size_t /*n*/, TablePropertiesCollection* /*props*/) override {
+ return Status();
+ }
+#endif // ROCKSDB_LITE
+
+ using DB::KeyMayExist;
+ bool KeyMayExist(const ReadOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ std::string* /*value*/,
+ bool* value_found = nullptr) override {
+ if (value_found != nullptr) {
+ *value_found = false;
+ }
+ return true; // Not Supported directly
+ }
+ using DB::NewIterator;
+ Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* /*column_family*/) override {
+ if (options.snapshot == nullptr) {
+ KVMap* saved = new KVMap;
+ *saved = map_;
+ return new ModelIter(saved, true);
+ } else {
+ const KVMap* snapshot_state =
+ &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+ return new ModelIter(snapshot_state, false);
+ }
+ }
+ Status NewIterators(const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ std::vector<Iterator*>* /*iterators*/) override {
+ return Status::NotSupported("Not supported yet");
+ }
+ const Snapshot* GetSnapshot() override {
+ ModelSnapshot* snapshot = new ModelSnapshot;
+ snapshot->map_ = map_;
+ return snapshot;
+ }
+
+ void ReleaseSnapshot(const Snapshot* snapshot) override {
+ delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+ }
+
+ Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override {
+ class Handler : public WriteBatch::Handler {
+ public:
+ KVMap* map_;
+ void Put(const Slice& key, const Slice& value) override {
+ (*map_)[key.ToString()] = value.ToString();
+ }
+ void Merge(const Slice& /*key*/, const Slice& /*value*/) override {
+ // ignore merge for now
+ // (*map_)[key.ToString()] = value.ToString();
+ }
+ void Delete(const Slice& key) override { map_->erase(key.ToString()); }
+ };
+ Handler handler;
+ handler.map_ = &map_;
+ return batch->Iterate(&handler);
+ }
+
+ using DB::GetProperty;
+ bool GetProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/, std::string* /*value*/) override {
+ return false;
+ }
+ using DB::GetIntProperty;
+ bool GetIntProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/, uint64_t* /*value*/) override {
+ return false;
+ }
+ using DB::GetMapProperty;
+ bool GetMapProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* /*value*/) override {
+ return false;
+ }
+ using DB::GetAggregatedIntProperty;
+ bool GetAggregatedIntProperty(const Slice& /*property*/,
+ uint64_t* /*value*/) override {
+ return false;
+ }
+ using DB::GetApproximateSizes;
+ Status GetApproximateSizes(const SizeApproximationOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Range* /*range*/, int n,
+ uint64_t* sizes) override {
+ for (int i = 0; i < n; i++) {
+ sizes[i] = 0;
+ }
+ return Status::OK();
+ }
+ using DB::GetApproximateMemTableStats;
+ void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
+ const Range& /*range*/,
+ uint64_t* const count,
+ uint64_t* const size) override {
+ *count = 0;
+ *size = 0;
+ }
+ using DB::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*start*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& /*new_options*/)
+ override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ using DB::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status PauseBackgroundWork() override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status ContinueBackgroundWork() override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& /*column_family_handles*/)
+ override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ void EnableManualCompaction() override { return; }
+
+ void DisableManualCompaction() override { return; }
+
+ using DB::NumberLevels;
+ int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
+
+ using DB::MaxMemCompactionLevel;
+ int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
+ return 1;
+ }
+
+ using DB::Level0StopWriteTrigger;
+ int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
+ return -1;
+ }
+
+ const std::string& GetName() const override { return name_; }
+
+ Env* GetEnv() const override { return nullptr; }
+
+ using DB::GetOptions;
+ Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
+ return options_;
+ }
+
+ using DB::GetDBOptions;
+ DBOptions GetDBOptions() const override { return options_; }
+
+ using DB::Flush;
+ Status Flush(const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ Status ret;
+ return ret;
+ }
+ Status Flush(
+ const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
+ return Status::OK();
+ }
+
+ Status SyncWAL() override { return Status::OK(); }
+
+#ifndef ROCKSDB_LITE
+ Status DisableFileDeletions() override { return Status::OK(); }
+
+ Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); }
+ Status GetLiveFiles(std::vector<std::string>&, uint64_t* /*size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::OK();
+ }
+
+ Status GetSortedWalFiles(VectorLogPtr& /*files*/) override {
+ return Status::OK();
+ }
+
+ Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* /*current_log_file*/) override {
+ return Status::OK();
+ }
+
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* /*creation_time*/) override {
+ return Status::NotSupported();
+ }
+
+ Status DeleteFile(std::string /*name*/) override { return Status::OK(); }
+
+ Status GetUpdatesSince(
+ ROCKSDB_NAMESPACE::SequenceNumber,
+ std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator>*,
+ const TransactionLogIterator::ReadOptions& /*read_options*/ =
+ TransactionLogIterator::ReadOptions()) override {
+ return Status::NotSupported("Not supported in Model DB");
+ }
+
+ void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+ ColumnFamilyMetaData* /*metadata*/) override {}
+#endif // ROCKSDB_LITE
+
+ Status GetDbIdentity(std::string& /*identity*/) const override {
+ return Status::OK();
+ }
+
+ SequenceNumber GetLatestSequenceNumber() const override { return 0; }
+
+ bool SetPreserveDeletesSequenceNumber(SequenceNumber /*seqnum*/) override {
+ return true;
+ }
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
+ private:
+ class ModelIter : public Iterator {
+ public:
+ ModelIter(const KVMap* map, bool owned)
+ : map_(map), owned_(owned), iter_(map_->end()) {}
+ ~ModelIter() override {
+ if (owned_) delete map_;
+ }
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const KVMap* const map_;
+ const bool owned_; // Do we own map_
+ KVMap::const_iterator iter_;
+ };
+ const Options options_;
+ KVMap map_;
+ std::string name_ = "";
+};
+
+#ifndef ROCKSDB_VALGRIND_RUN
+static std::string RandomKey(Random* rnd, int minimum = 0) {
+ int len;
+ do {
+ len = (rnd->OneIn(3)
+ ? 1 // Short sometimes to encourage collisions
+ : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+ } while (len < minimum);
+ return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step, DB* model, DB* db,
+ const Snapshot* model_snap,
+ const Snapshot* db_snap) {
+ ReadOptions options;
+ options.snapshot = model_snap;
+ Iterator* miter = model->NewIterator(options);
+ options.snapshot = db_snap;
+ Iterator* dbiter = db->NewIterator(options);
+ bool ok = true;
+ int count = 0;
+ for (miter->SeekToFirst(), dbiter->SeekToFirst();
+ ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) {
+ count++;
+ if (miter->key().compare(dbiter->key()) != 0) {
+ fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step,
+ EscapeString(miter->key()).c_str(),
+ EscapeString(dbiter->key()).c_str());
+ ok = false;
+ break;
+ }
+
+ if (miter->value().compare(dbiter->value()) != 0) {
+ fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+ step, EscapeString(miter->key()).c_str(),
+ EscapeString(miter->value()).c_str(),
+              EscapeString(dbiter->value()).c_str());
+ ok = false;
+ }
+ }
+
+ if (ok) {
+ if (miter->Valid() != dbiter->Valid()) {
+ fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+ step, miter->Valid(), dbiter->Valid());
+ ok = false;
+ }
+ }
+ delete miter;
+ delete dbiter;
+ return ok;
+}
+
+class DBTestRandomized : public DBTest,
+ public ::testing::WithParamInterface<int> {
+ public:
+ void SetUp() override { option_config_ = GetParam(); }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> option_configs;
+ // skip cuckoo hash as it does not support snapshot.
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (!ShouldSkipOptions(option_config,
+ kSkipDeletesFilterFirst | kSkipNoSeekToLast)) {
+ option_configs.push_back(option_config);
+ }
+ }
+ option_configs.push_back(kBlockBasedTableWithIndexRestartInterval);
+ return option_configs;
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBTestRandomized, DBTestRandomized,
+ ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs()));
+
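+// Apply a random mix of Puts, Deletes and multi-operation WriteBatches to
+// both the model and the real DB, periodically comparing full iterator scans
+// (with and without snapshots) and again after a Reopen().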
+TEST_P(DBTestRandomized, Randomized) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ DestroyAndReopen(options);
+
+ Random rnd(test::RandomSeed() + GetParam());
+ ModelDB model(options);
+ const int N = 10000;
+ const Snapshot* model_snap = nullptr;
+ const Snapshot* db_snap = nullptr;
+ std::string k, v;
+ for (int step = 0; step < N; step++) {
+ // TODO(sanjay): Test Get() works
+ int p = rnd.Uniform(100);
+ int minimum = 0;
+ if (option_config_ == kHashSkipList || option_config_ == kHashLinkList ||
+ option_config_ == kPlainTableFirstBytePrefix ||
+ option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+ option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+ minimum = 1;
+ }
+ if (p < 45) { // Put
+ k = RandomKey(&rnd, minimum);
+ v = RandomString(&rnd,
+ rnd.OneIn(20) ? 100 + rnd.Uniform(100) : rnd.Uniform(8));
+ ASSERT_OK(model.Put(WriteOptions(), k, v));
+ ASSERT_OK(db_->Put(WriteOptions(), k, v));
+ } else if (p < 90) { // Delete
+ k = RandomKey(&rnd, minimum);
+ ASSERT_OK(model.Delete(WriteOptions(), k));
+ ASSERT_OK(db_->Delete(WriteOptions(), k));
+ } else { // Multi-element batch
+ WriteBatch b;
+ const int num = rnd.Uniform(8);
+ for (int i = 0; i < num; i++) {
+ if (i == 0 || !rnd.OneIn(10)) {
+ k = RandomKey(&rnd, minimum);
+ } else {
+ // Periodically re-use the same key from the previous iter, so
+ // we have multiple entries in the write batch for the same key
+ }
+ if (rnd.OneIn(2)) {
+ v = RandomString(&rnd, rnd.Uniform(10));
+ b.Put(k, v);
+ } else {
+ b.Delete(k);
+ }
+ }
+ ASSERT_OK(model.Write(WriteOptions(), &b));
+ ASSERT_OK(db_->Write(WriteOptions(), &b));
+ }
+
+ if ((step % 100) == 0) {
+ // For DB instances that use the hash index + block-based table, the
+      // iterator will be invalid when seeking a non-existent key, rather
+      // than returning a key that is close to it.
+ if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+ option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+ ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+ }
+
+ // Save a snapshot from each DB this time that we'll use next
+ // time we compare things, to make sure the current state is
+ // preserved with the snapshot
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+ Reopen(options);
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+ model_snap = model.GetSnapshot();
+ db_snap = db_->GetSnapshot();
+ }
+ }
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+}
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ Reopen(options);
+ ASSERT_OK(Put("k1", "v1"));
+ Flush();
+ ASSERT_OK(Put("k2", "v2"));
+
+ // Reopen it without prefix extractor, make sure everything still works.
+ // RocksDB should just fall back to the binary index.
+ table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset();
+
+ Reopen(options);
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ options.max_open_files = 10;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 11;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ ASSERT_OK(Put("k1", "v1"));
+ Flush();
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Force all open tables to be evicted.
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  // Then let the table cache keep one entry.
+ dbfull()->TEST_table_cache()->SetCapacity(1);
+
+ ReadOptions read_options;
+ read_options.total_order_seek = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+
+ // After total order seek, prefix index should still be used.
+ read_options.total_order_seek = false;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
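+// ChecksumTest writes tables with different checksum types (kCRC32c and
+// kxxHash) and verifies that all of them remain readable after reopening with
+// either setting, since the checksum type used is recorded in each table.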
+TEST_F(DBTest, ChecksumTest) {
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Flush()); // table with crc checksum
+
+ table_options.checksum = kxxHash;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("e", "f"));
+ ASSERT_OK(Put("g", "h"));
+ ASSERT_OK(Flush()); // table with xxhash checksum
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+}
+
+#ifndef ROCKSDB_LITE
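+// The FIFO compaction tests below rely on the oldest level-0 files being
+// dropped once their total size exceeds
+// compaction_options_fifo.max_table_files_size, optionally combined with
+// intra-L0 compaction (allow_compaction) and TTL-based expiry.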
+TEST_P(DBTestWithParam, FIFOCompactionTest) {
+ for (int iter = 0; iter < 2; ++iter) {
+ // first iteration -- auto compaction
+ // second iteration -- manual compaction
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.max_subcompactions = max_subcompactions_;
+ if (iter == 1) {
+ options.disable_auto_compactions = true;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 110; ++j) {
+ ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980)));
+ }
+ // flush should happen here
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (iter == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ } else {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+ // only 5 files should survive
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ for (int i = 0; i < 50; ++i) {
+ // these keys should be deleted in previous compaction
+ ASSERT_EQ("NOT_FOUND", Get(ToString(i)));
+ }
+ }
+}
+
+TEST_F(DBTest, FIFOCompactionTestWithCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20K
+ options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 6;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // It should be compacted to no more than 20 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+}
+
+TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20K
+ options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 3;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500)));
+ ASSERT_OK(Put("key" + ToString(i), ""));
+ ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500)));
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("", Get("key" + ToString(i)));
+ }
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500)));
+ ASSERT_OK(Delete("key" + ToString(i)));
+ ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500)));
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i)));
+ }
+}
+
+// Check that FIFO-with-TTL is supported with max_open_files != -1.
+TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ // TTL is now supported with max_open_files != -1.
+ options.max_open_files = 100;
+ options = CurrentOptions(options);
+ ASSERT_OK(TryReopen(options));
+
+ options.max_open_files = -1;
+ ASSERT_OK(TryReopen(options));
+}
+
+// Check that FIFO-with-TTL is supported only with BlockBasedTableFactory.
+TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ options = CurrentOptions(options);
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ASSERT_OK(TryReopen(options));
+
+ Destroy(options);
+ options.table_factory.reset(NewPlainTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ Destroy(options);
+ options.table_factory.reset(NewAdaptiveTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTest, FIFOCompactionWithTTLTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // manual compaction.
+ {
+ env_->addon_time_.store(0);
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+    options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ // Note: Couldn't use SleepForMicroseconds because it takes an int instead
+ // of uint64_t. Hence used addon_time_ directly.
+ // env_->SleepForMicroseconds(2 * 60 * 60 * 1000 * 1000);
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+
+    // Since no flushes or compactions have run, the db should still be in
+ // the same state even after considerable time has passed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ }
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // automatic compaction.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Create 1 more file to trigger TTL compaction. The old files are dropped.
+ for (int i = 0; i < 1; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Only the newly created file remains.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test that shows the fall back to size-based FIFO compaction if TTL-based
+ // deletion doesn't move the total size to be less than max_table_files_size.
+ {
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 140; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test with TTL + Intra-L0 compactions.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options.level0_file_num_compaction_trigger = 6;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1
+ // (due to level0_file_num_compaction_trigger = 6).
+ // So total files = 1 + remaining 4 = 5.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+ // Create 10 more files. The old 5 files are dropped as their ttl expired.
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test with large TTL + Intra-L0 compactions.
+ // Files dropped based on size, as ttl doesn't kick in.
+ {
+ options.write_buffer_size = 20 << 10; // 20K
+ options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options.level0_file_num_compaction_trigger = 6;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // It should be compacted to no more than 20 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+/*
+ * This test is not reliable enough as it heavily depends on disk behavior.
+ * Disable as it is flaky.
+ */
+TEST_F(DBTest, DISABLED_RateLimitingTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1 << 20; // 1MB
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 1 << 20; // 1MB
+ options.max_bytes_for_level_base = 4 << 20; // 4MB
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.IncreaseParallelism(4);
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ // # no rate limiting
+ Random rnd(301);
+ uint64_t start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(
+ Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo));
+ }
+ uint64_t elapsed = env_->NowMicros() - start;
+ double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed;
+ uint64_t rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS);
+ ASSERT_EQ(0, rate_limiter_drains);
+ Close();
+
+  // # rate limiting with 0.7 x raw_rate
+ options.rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
+ env_->bytes_written_ = 0;
+ DestroyAndReopen(options);
+
+ start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(
+ Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo));
+ }
+ rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+ rate_limiter_drains;
+ elapsed = env_->NowMicros() - start;
+ Close();
+ ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, 0);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
+ ASSERT_TRUE(ratio < 0.8);
+
+ // # rate limiting with half of the raw_rate
+ options.rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
+ env_->bytes_written_ = 0;
+ DestroyAndReopen(options);
+
+ start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(
+ Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo));
+ }
+ elapsed = env_->NowMicros() - start;
+ rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+ rate_limiter_drains;
+ Close();
+ ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, elapsed / 100000 / 2);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+ ASSERT_LT(ratio, 0.6);
+}
+
+TEST_F(DBTest, TableOptionsSanitizeTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
+
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ Destroy(options);
+ ASSERT_TRUE(!TryReopen(options).IsNotSupported());
+
+  // Test the prefix_extractor check when a hash index is used for a
+  // block-based table
+ BlockBasedTableOptions to;
+ to.index_type = BlockBasedTableOptions::kHashSearch;
+ options = CurrentOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(to));
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ ASSERT_OK(TryReopen(options));
+}
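+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the valid counterpart of the sanitization
+// checks above -- a hash-search index requires a prefix extractor. The
+// 3-byte prefix and the keys are arbitrary.
+TEST_F(DBTest, DISABLED_HashIndexWithPrefixExtractorSketch) {
+  BlockBasedTableOptions table_options;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("abc1", "v"));
+  ASSERT_EQ("v", Get("abc1"));
+}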
+
+TEST_F(DBTest, ConcurrentMemtableNotSupported) {
+ Options options = CurrentOptions();
+ options.allow_concurrent_memtable_write = true;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+
+ DestroyDB(dbname_, options);
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ASSERT_NOK(TryReopen(options));
+
+ options.memtable_factory.reset(new SkipListFactory);
+ ASSERT_OK(TryReopen(options));
+
+ ColumnFamilyOptions cf_options(options);
+ cf_options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ColumnFamilyHandle* handle;
+ ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle));
+}
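+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the supported configuration implied by the
+// test above -- concurrent memtable writes are only allowed with the
+// default skip-list memtable representation.
+TEST_F(DBTest, DISABLED_ConcurrentMemtableSupportedSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.allow_concurrent_memtable_write = true;
+  options.memtable_factory.reset(new SkipListFactory);
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("k", "v"));
+  ASSERT_EQ("v", Get("k"));
+}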
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, SanitizeNumThreads) {
+ for (int attempt = 0; attempt < 2; attempt++) {
+ const size_t kTotalTasks = 8;
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+
+ Options options = CurrentOptions();
+ if (attempt == 0) {
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 2;
+ }
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+      // Insert 4 tasks into the low-priority queue and 4 tasks into the
+      // high-priority queue.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i],
+ (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
+ }
+
+    // Wait up to 10s for them to be scheduled.
+ for (int i = 0; i < 10000; i++) {
+ if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 &&
+ options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) {
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+
+    // Pool size 3, 4 tasks total. Queue size should be 1.
+    ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+    // Pool size 2, 4 tasks total. Queue size should be 2.
+    ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+
+ ASSERT_OK(Put("abc", "def"));
+ ASSERT_EQ("def", Get("abc"));
+ Flush();
+ ASSERT_EQ("def", Get("abc"));
+ }
+}
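+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the thread-pool sizing the sanitization test
+// above depends on. IncreaseParallelism() is a convenience that sizes the
+// background thread pools; the count of 4 is arbitrary.
+TEST_F(DBTest, DISABLED_BackgroundThreadPoolSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.IncreaseParallelism(4);
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("abc", "def"));
+  ASSERT_EQ("def", Get("abc"));
+}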
+
+TEST_F(DBTest, WriteSingleThreadEntry) {
+ std::vector<port::Thread> threads;
+ dbfull()->TEST_LockMutex();
+ auto w = dbfull()->TEST_BeginWrite();
+ threads.emplace_back([&] { Put("a", "b"); });
+ env_->SleepForMicroseconds(10000);
+ threads.emplace_back([&] { Flush(); });
+ env_->SleepForMicroseconds(10000);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->TEST_LockMutex();
+ dbfull()->TEST_EndWrite(w);
+ dbfull()->TEST_UnlockMutex();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+}
+
+TEST_F(DBTest, ConcurrentFlushWAL) {
+ const size_t cnt = 100;
+ Options options;
+ WriteOptions wopt;
+ ReadOptions ropt;
+ for (bool two_write_queues : {false, true}) {
+ for (bool manual_wal_flush : {false, true}) {
+ options.two_write_queues = two_write_queues;
+ options.manual_wal_flush = manual_wal_flush;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] {
+ for (size_t i = 0; i < cnt; i++) {
+ auto istr = ToString(i);
+ db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, "b" + istr);
+ }
+ });
+ if (two_write_queues) {
+ threads.emplace_back([&] {
+ for (size_t i = cnt; i < 2 * cnt; i++) {
+ auto istr = ToString(i);
+ WriteBatch batch;
+ batch.Put("a" + istr, "b" + istr);
+ dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true);
+ }
+ });
+ }
+ threads.emplace_back([&] {
+ for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put
+ db_->FlushWAL(false);
+ }
+ });
+ for (auto& t : threads) {
+ t.join();
+ }
+ options.create_if_missing = false;
+ // Recover from the wal and make sure that it is not corrupted
+ Reopen(options);
+ for (size_t i = 0; i < cnt; i++) {
+ PinnableSlice pval;
+ auto istr = ToString(i);
+ ASSERT_OK(
+ db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval));
+ ASSERT_TRUE(pval == ("b" + istr));
+ }
+ }
+ }
+}
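+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): basic manual WAL flushing, which the test
+// above exercises concurrently. With manual_wal_flush the WAL is only
+// persisted when FlushWAL() is called explicitly.
+TEST_F(DBTest, DISABLED_ManualWALFlushSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.manual_wal_flush = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("k", "v"));
+  // Persist the buffered WAL entries; pass true to also sync the file.
+  ASSERT_OK(db_->FlushWAL(false));
+}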
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DynamicMemtableOptions) {
+ const uint64_t k64KB = 1 << 16;
+ const uint64_t k128KB = 1 << 17;
+ const uint64_t k5KB = 5 * 1024;
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.max_background_compactions = 1;
+ options.write_buffer_size = k64KB;
+ options.arena_block_size = 16 * 1024;
+ options.max_write_buffer_number = 2;
+ // Don't trigger compact/slowdown/stop
+ options.level0_file_num_compaction_trigger = 1024;
+ options.level0_slowdown_writes_trigger = 1024;
+ options.level0_stop_writes_trigger = 1024;
+ DestroyAndReopen(options);
+
+ auto gen_l0_kb = [this](int size) {
+ const int kNumPutsBeforeWaitForFlush = 64;
+ Random rnd(301);
+ for (int i = 0; i < size; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+
+ // The following condition prevents a race condition between flush jobs
+ // acquiring work and this thread filling up multiple memtables. Without
+ // this, the flush might produce less files than expected because
+ // multiple memtables are flushed into a single L0 file. This race
+ // condition affects assertion (A).
+ if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ };
+
+ // Test write_buffer_size
+ gen_l0_kb(64);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
+ ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2);
+
+ // Clean up L0
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Increase buffer size
+ ASSERT_OK(dbfull()->SetOptions({
+ {"write_buffer_size", "131072"},
+ }));
+
+ // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
+ // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
+ gen_l0_kb(192);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A)
+ ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
+ ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
+
+ // Decrease buffer size below current usage
+ ASSERT_OK(dbfull()->SetOptions({
+ {"write_buffer_size", "65536"},
+ }));
+ // The existing memtable became eligible for flush when we reduced its
+ // capacity to 64KB. Two keys need to be added to trigger flush: first causes
+ // memtable to be marked full, second schedules the flush. Then we should have
+ // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
+ gen_l0_kb(2);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+ ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
+
+ // Test max_write_buffer_number
+  // Block the compaction thread, which also blocks flushes: with
+  // max_background_flushes == 0, flushes are executed by the compaction
+  // thread.
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+  // Start from scratch and disable compaction/flush. Flush can only happen
+  // during compaction, but the trigger is pretty high.
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(0, Env::HIGH);
+
+ // Put until writes are stopped, bounded by 256 puts. We should see stop at
+ // ~128KB
+ int count = 0;
+ Random rnd(301);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { sleeping_task_low.WakeUp(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ while (!sleeping_task_low.WokenUp() && count < 256) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+ count++;
+ }
+ ASSERT_GT(static_cast<double>(count), 128 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 128 * 1.2);
+
+ sleeping_task_low.WaitUntilDone();
+
+ // Increase
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "8"},
+ }));
+ // Clean up memtable and L0
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number.
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ // Decrease
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "4"},
+ }));
+ // Clean up memtable and L0
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number.
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 266 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
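+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the SetOptions() pattern the test above
+// relies on, shown in isolation. SetOptions() takes a map of option names
+// to string values; the sizes here are arbitrary.
+TEST_F(DBTest, DISABLED_SetWriteBufferSizeSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 64 << 10;  // start with a 64KB memtable
+  DestroyAndReopen(options);
+  ASSERT_OK(dbfull()->SetOptions({{"write_buffer_size", "131072"}}));
+  ASSERT_EQ(131072u, dbfull()->GetOptions().write_buffer_size);
+}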
+#endif // ROCKSDB_LITE
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+ int expected_count) {
+ int op_count = 0;
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ if (thread.operation_type == op_type) {
+ op_count++;
+ }
+ }
+ ASSERT_EQ(op_count, expected_count);
+}
+} // namespace
+
+TEST_F(DBTest, GetThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ TryReopen(options);
+
+ std::vector<ThreadStatus> thread_list;
+ Status s = env_->GetThreadList(&thread_list);
+
+ for (int i = 0; i < 2; ++i) {
+    // Repeat the test with different numbers of high / low priority threads.
+ const int kTestCount = 3;
+ const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+ const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+ const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4};
+ for (int test = 0; test < kTestCount; ++test) {
+ // Change the number of threads in high / low priority pool.
+ env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+ env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM);
+      // Wait to ensure all threads have been registered.
+ unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+ // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after
+ // all threads have been registered.
+ // Try up to 60 seconds.
+ for (int num_try = 0; num_try < 60000; num_try++) {
+ env_->SleepForMicroseconds(1000);
+ thread_list.clear();
+ s = env_->GetThreadList(&thread_list);
+ ASSERT_OK(s);
+ memset(thread_type_counts, 0, sizeof(thread_type_counts));
+ for (auto thread : thread_list) {
+ ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+ thread_type_counts[thread.thread_type]++;
+ }
+ if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] ==
+ kHighPriCounts[test] &&
+ thread_type_counts[ThreadStatus::LOW_PRIORITY] ==
+ kLowPriCounts[test] &&
+ thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] ==
+ kBottomPriCounts[test]) {
+ break;
+ }
+ }
+ // Verify the number of high-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+ kHighPriCounts[test]);
+ // Verify the number of low-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY],
+ kLowPriCounts[test]);
+ // Verify the number of bottom-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY],
+ kBottomPriCounts[test]);
+ }
+ if (i == 0) {
+ // repeat the test with multiple column families
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ }
+ }
+ db_->DropColumnFamily(handles_[2]);
+ delete handles_[2];
+ handles_.erase(handles_.begin() + 2);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ Close();
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+}
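+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the minimal GetThreadList() polling pattern
+// used by the thread-status tests; it requires enable_thread_tracking.
+TEST_F(DBTest, DISABLED_GetThreadListSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  DestroyAndReopen(options);
+  std::vector<ThreadStatus> thread_list;
+  ASSERT_OK(env_->GetThreadList(&thread_list));
+  for (const auto& thread : thread_list) {
+    // Each entry reports the thread type and its current operation, if any.
+    ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+  }
+}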
+
+TEST_F(DBTest, DisableThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = false;
+ TryReopen(options);
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+  // Verify that none of the column family info exists.
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.enable_thread_tracking = true;
+ options = CurrentOptions(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+ {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ uint64_t num_running_flushes = 0;
+ db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes);
+ ASSERT_EQ(num_running_flushes, 0);
+
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ Put(1, "k2", std::string(100000, 'y')); // Trigger flush
+
+ // The first sync point is to make sure there's one flush job
+ // running when we perform VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+ db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes);
+ ASSERT_EQ(num_running_flushes, 1);
+  // The second sync point ensures the flush job will not complete until
+  // we have performed VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ const int kNumL0Files = 4;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_subcompactions = max_subcompactions_;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
+ {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
+ {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
+ });
+ for (int tests = 0; tests < 2; ++tests) {
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ // The Put Phase.
+ for (int file = 0; file < kNumL0Files; ++file) {
+ for (int key = 0; key < kEntriesPerBuffer; ++key) {
+ ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer),
+ RandomString(&rnd, kTestValueSize)));
+ }
+ Flush();
+ }
+    // This makes sure a compaction won't be scheduled until
+    // we are done with the above Put Phase.
+ uint64_t num_running_compactions = 0;
+ db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+ &num_running_compactions);
+ ASSERT_EQ(num_running_compactions, 0);
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ // This makes sure at least one compaction is running.
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
+
+ if (options.enable_thread_tracking) {
+ // expecting one single L0 to L1 compaction
+ VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
+ } else {
+ // If thread tracking is not enabled, compaction count should be 0.
+ VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
+ }
+ db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+ &num_running_compactions);
+ ASSERT_EQ(num_running_compactions, 1);
+ // TODO(yhchiang): adding assert to verify each compaction stage.
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
+
+    // Repeat the test with thread tracking disabled.
+ options.enable_thread_tracking = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p1", "p9");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+ CancelAllBackgroundWork(db_);
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBTest, PreShutdownFlush) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "key", "value"));
+ CancelAllBackgroundWork(db_);
+ Status s =
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_TRUE(s.IsShutdownInProgress());
+}
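+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): the typical pre-shutdown sequence the tests
+// in this group exercise -- cancel background work, optionally waiting for
+// jobs already in flight, before closing the DB.
+TEST_F(DBTest, DISABLED_PreShutdownSequenceSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("key", "value"));
+  // Pass wait=true to block until already-running jobs have finished.
+  CancelAllBackgroundWork(db_, /*wait=*/true);
+  Close();
+}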
+
+TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 40;
+ const int kNumL0Files = 4;
+
+ const int kHighPriCount = 3;
+ const int kLowPriCount = 5;
+ env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * kNumL0Files;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_background_compactions = kLowPriCount;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.max_subcompactions = max_subcompactions_;
+
+ TryReopen(options);
+ Random rnd(301);
+
+ std::vector<ThreadStatus> thread_list;
+ // Delay both flush and compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
+ {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
+ "CompactionJob::Run():End"},
+ {"CompactionJob::Run():End",
+ "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make rocksdb busy
+ int key = 0;
+ // check how many threads are doing compaction using GetThreadList
+ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+ for (int file = 0; file < 16 * kNumL0Files; ++file) {
+ for (int k = 0; k < kEntriesPerBuffer; ++k) {
+ ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
+ }
+
+ Status s = env_->GetThreadList(&thread_list);
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+
+ // Speed up the test
+ if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+ operation_count[ThreadStatus::OP_COMPACTION] >
+ 0.6 * options.max_background_compactions) {
+ break;
+ }
+ if (file == 15 * kNumL0Files) {
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+ }
+ }
+
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+ CancelAllBackgroundWork(db_);
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
+ dbfull()->TEST_WaitForCompact();
+  // Count the operations still running after the shutdown request.
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+ operation_count[i] = 0;
+ }
+ Status s = env_->GetThreadList(&thread_list);
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+ ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 40;
+ const int kNumL0Files = 4;
+
+ const int kHighPriCount = 3;
+ const int kLowPriCount = 5;
+ env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * kNumL0Files;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_background_compactions = kLowPriCount;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.max_subcompactions = max_subcompactions_;
+
+ TryReopen(options);
+ Random rnd(301);
+
+ std::vector<ThreadStatus> thread_list;
+ // Delay both flush and compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
+ "CompactionJob::Run():Inprogress"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
+ {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
+ {"CompactionJob::Run():End",
+ "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make rocksdb busy
+ int key = 0;
+ // check how many threads are doing compaction using GetThreadList
+ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+ for (int file = 0; file < 16 * kNumL0Files; ++file) {
+ for (int k = 0; k < kEntriesPerBuffer; ++k) {
+ ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
+ }
+
+ Status s = env_->GetThreadList(&thread_list);
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+
+ // Speed up the test
+ if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+ operation_count[ThreadStatus::OP_COMPACTION] >
+ 0.6 * options.max_background_compactions) {
+ break;
+ }
+ if (file == 15 * kNumL0Files) {
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
+ }
+ }
+
+ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+ CancelAllBackgroundWork(db_);
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
+ dbfull()->TEST_WaitForCompact();
+  // Count the operations still running after the shutdown request.
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+ operation_count[i] = 0;
+ }
+ Status s = env_->GetThreadList(&thread_list);
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+ ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushOnDestroy) {
+ WriteOptions wo;
+ wo.disableWAL = true;
+ ASSERT_OK(Put("foo", "v1", wo));
+ CancelAllBackgroundWork(db_);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ const int kNKeys = 120;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ Random rnd(301);
+ Options options;
+ options.create_if_missing = true;
+ options.db_write_buffer_size = 20480;
+ options.write_buffer_size = 20480;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 20480;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 102400;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 1;
+ options.num_levels = 5;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kNoCompression;
+ options.compression_per_level[2] = kSnappyCompression;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+ // be compressed, so total data size should be more than 80K.
+ for (int i = 0; i < 20; i++) {
+ ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  // Assuming each file's metadata is at least 50 bytes.
+ ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
+
+ // Insert 400KB. Some data will be compressed
+ for (int i = 21; i < 120; i++) {
+ ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+ }
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
+ 120U * 4000U + 50U * 24);
+  // Make sure data in L3 files is not compacted by removing all files in
+  // L4, then count the number of rows.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ for (auto file : cf_meta.levels[4].files) {
+ listener->SetExpectedFileName(dbname_ + file.name);
+ ASSERT_OK(dbfull()->DeleteFile(file.name));
+ }
+ listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+ int num_keys = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
+}
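+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): a minimal per-level compression setup of the
+// kind exercised above. With dynamic level bytes, the listed compression
+// types are mapped onto the levels actually in use.
+TEST_F(DBTest, DISABLED_CompressionPerLevelSketch) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = 5;
+  options.compression_per_level = {kNoCompression, kNoCompression,
+                                   kSnappyCompression};
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("k", "v"));
+  ASSERT_OK(Flush());
+}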
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
+ if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+ return;
+ }
+ const int kNKeys = 500;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ Random rnd(301);
+ Options options;
+ options.create_if_missing = true;
+ options.db_write_buffer_size = 6000000;
+ options.write_buffer_size = 600000;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.target_file_size_base = 20;
+
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 200;
+ options.max_bytes_for_level_multiplier = 8;
+ options.max_background_compactions = 1;
+ options.num_levels = 5;
+ std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+ options.table_factory = mtf;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kLZ4Compression;
+ options.compression_per_level[2] = kZlibCompression;
+
+ DestroyAndReopen(options);
+ // When base level is L4, L4 is LZ4.
+ std::atomic<int> num_zlib(0);
+ std::atomic<int> num_lz4(0);
+ std::atomic<int> num_no(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ if (compaction->output_level() == 4) {
+ ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+ num_lz4.fetch_add(1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+ auto* compression = reinterpret_cast<CompressionType*>(arg);
+ ASSERT_TRUE(*compression == kNoCompression);
+ num_no.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < 100; i++) {
+ std::string value = RandomString(&rnd, 200);
+ ASSERT_OK(Put(Key(keys[i]), value));
+ if (i % 25 == 24) {
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ }
+ }
+
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+ ASSERT_GT(NumTableFilesAtLevel(4), 0);
+ ASSERT_GT(num_no.load(), 2);
+ ASSERT_GT(num_lz4.load(), 0);
+ int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+  // After the base level turns from L4 to L3, L3 becomes LZ4 and L4
+  // becomes Zlib.
+ num_lz4.store(0);
+ num_no.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+ ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+ num_zlib.fetch_add(1);
+ } else {
+ ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+ num_lz4.fetch_add(1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+ auto* compression = reinterpret_cast<CompressionType*>(arg);
+ ASSERT_TRUE(*compression == kNoCompression);
+ num_no.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 101; i < 500; i++) {
+ std::string value = RandomString(&rnd, 200);
+ ASSERT_OK(Put(Key(keys[i]), value));
+ if (i % 100 == 99) {
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_GT(NumTableFilesAtLevel(3), 0);
+ ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+ ASSERT_GT(num_no.load(), 2);
+ ASSERT_GT(num_lz4.load(), 0);
+ ASSERT_GT(num_zlib.load(), 0);
+}
+
+TEST_F(DBTest, DynamicCompactionOptions) {
+ // minimum write buffer size is enforced at 64KB
+ const uint64_t k32KB = 1 << 15;
+ const uint64_t k64KB = 1 << 16;
+ const uint64_t k128KB = 1 << 17;
+ const uint64_t k1MB = 1 << 20;
+ const uint64_t k4KB = 1 << 12;
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.write_buffer_size = k64KB;
+ options.arena_block_size = 4 * k4KB;
+ options.max_write_buffer_number = 2;
+ // Compaction related options
+ options.level0_file_num_compaction_trigger = 3;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 8;
+ options.target_file_size_base = k64KB;
+ options.max_compaction_bytes = options.target_file_size_base * 10;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = k128KB;
+ options.max_bytes_for_level_multiplier = 4;
+
+ // Block flush thread and disable compaction thread
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ DestroyAndReopen(options);
+
+ auto gen_l0_kb = [this](int start, int size, int stride) {
+ Random rnd(301);
+ for (int i = 0; i < size; i++) {
+ ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ };
+
+ // Write 3 files that have the same key range.
+ // Since level0_file_num_compaction_trigger is 3, compaction should be
+ // triggered. The compaction should result in one L1 file
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ gen_l0_kb(0, 64, 1);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel());
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(1U, metadata.size());
+ ASSERT_LE(metadata[0].size, k64KB + k4KB);
+ ASSERT_GE(metadata[0].size, k64KB - k4KB);
+
+ // Test compaction trigger and target_file_size_base
+ // Reduce compaction trigger to 2, and reduce L1 file size to 32KB.
+  // Writing two 64KB L0 files should trigger a compaction. Since these
+  // 2 L0 files have the same key range, the compaction merges them and
+  // should result in two 32KB L1 files.
+ ASSERT_OK(dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+ {"target_file_size_base", ToString(k32KB)}}));
+
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ("1,1", FilesPerLevel());
+ gen_l0_kb(0, 64, 1);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,2", FilesPerLevel());
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(2U, metadata.size());
+ ASSERT_LE(metadata[0].size, k32KB + k4KB);
+ ASSERT_GE(metadata[0].size, k32KB - k4KB);
+ ASSERT_LE(metadata[1].size, k32KB + k4KB);
+ ASSERT_GE(metadata[1].size, k32KB - k4KB);
+
+ // Test max_bytes_for_level_base
+  // Increase the level base size to 1MB and write enough data to fill L1
+  // and L2. L1 size should be around 1MB while L2 size should be around
+  // 1MB x 4.
+ ASSERT_OK(
+ dbfull()->SetOptions({{"max_bytes_for_level_base", ToString(k1MB)}}));
+
+ // writing 96 x 64KB => 6 * 1024KB
+ // (L1 + L2) = (1 + 4) * 1024KB
+ for (int i = 0; i < 96; ++i) {
+ gen_l0_kb(i, 64, 96);
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_GT(SizeAtLevel(1), k1MB / 2);
+ ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
+
+ // Within (0.5, 1.5) of 4MB.
+ ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
+ ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
+
+  // Test max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. Now, reduce both the multiplier and the level
+  // base. After filling enough data that can fit in L1 - L3, we should see
+  // L1 size reduce to 128KB from the ~1MB asserted previously. Same for L2.
+ ASSERT_OK(
+ dbfull()->SetOptions({{"max_bytes_for_level_multiplier", "2"},
+ {"max_bytes_for_level_base", ToString(k128KB)}}));
+
+ // writing 20 x 64KB = 10 x 128KB
+ // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
+ for (int i = 0; i < 20; ++i) {
+ gen_l0_kb(i, 64, 32);
+ }
+ dbfull()->TEST_WaitForCompact();
+ uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
+ ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
+
+ // Test level0_stop_writes_trigger.
+  // Clean up memtable and L0. Block compaction threads. If we continue to
+  // write and flush memtables, we should see puts stop after 8 memtable
+  // flushes since level0_stop_writes_trigger = 8.
+ dbfull()->TEST_FlushMemTable(true, true);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ // Block compaction
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ int count = 0;
+ Random rnd(301);
+ WriteOptions wo;
+ while (count < 64) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+ dbfull()->TEST_FlushMemTable(true, true);
+ count++;
+ if (dbfull()->TEST_write_controler().IsStopped()) {
+ sleeping_task_low.WakeUp();
+ break;
+ }
+ }
+ // Stop trigger = 8
+ ASSERT_EQ(count, 8);
+ // Unblock
+ sleeping_task_low.WaitUntilDone();
+
+ // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0.
+ // Block compaction thread again. Perform the put and memtable flushes
+ // until we see the stop after 6 memtable flushes.
+ ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}}));
+ dbfull()->TEST_FlushMemTable(true);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Block compaction again
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ count = 0;
+ while (count < 64) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+ dbfull()->TEST_FlushMemTable(true, true);
+ count++;
+ if (dbfull()->TEST_write_controler().IsStopped()) {
+ sleeping_task_low.WakeUp();
+ break;
+ }
+ }
+ ASSERT_EQ(count, 6);
+ // Unblock
+ sleeping_task_low.WaitUntilDone();
+
+ // Test disable_auto_compactions
+  // The compaction thread is unblocked but auto compaction is disabled.
+  // Writing 4 L0 files would normally trigger a compaction, but with auto
+  // compaction disabled, TEST_WaitForCompact waits for nothing and the
+  // number of L0 files does not change after the call.
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}}));
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ // Wait for compaction so that put won't stop
+ dbfull()->TEST_FlushMemTable(true);
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+
+ // Enable auto compaction and perform the same test, # of L0 files should be
+ // reduced after compaction.
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+ // Wait for compaction so that put won't stop
+ dbfull()->TEST_FlushMemTable(true);
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_LT(NumTableFilesAtLevel(0), 4);
+}
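+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite; disabled by default): changing several compaction-related options
+// in a single SetOptions() call, as the test above does piecemeal. The
+// values are arbitrary and only meant to show the string-map interface.
+TEST_F(DBTest, DISABLED_SetCompactionOptionsSketch) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(dbfull()->SetOptions({
+      {"level0_file_num_compaction_trigger", "2"},
+      {"target_file_size_base", "32768"},
+      {"max_bytes_for_level_base", "131072"},
+  }));
+  ASSERT_EQ(2, dbfull()->GetOptions().level0_file_num_compaction_trigger);
+}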
+
+// Test dynamic FIFO compaction options.
+// This test covers just option parsing and makes sure that the options are
+// correctly assigned. Also see the DBOptionsTest.SetFIFOCompactionOptions
+// test, which makes sure that the FIFO compaction functionality works
+// as expected when the options are changed dynamically.
+// Even more FIFOCompactionTests are at DBTest.FIFOCompaction* .
+TEST_F(DBTest, DynamicFIFOCompactionOptions) {
+ Options options;
+ options.ttl = 0;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Initial defaults
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024 * 1024 * 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=23;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 97);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=31;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 31);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{max_table_files_size=51;allow_compaction=true;}"}}));
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 51);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 49);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+}
+
+TEST_F(DBTest, DynamicUniversalCompactionOptions) {
+ Options options;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Initial defaults
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_universal", "{size_ratio=7;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_universal", "{min_merge_width=11;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 11u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FileCreationRandomFailure) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.target_file_size_base = 200000;
+ options.max_bytes_for_level_base = 1000000;
+ options.max_bytes_for_level_multiplier = 2;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kCDTKeysPerBuffer = 4;
+ const int kTestSize = kCDTKeysPerBuffer * 4096;
+ const int kTotalIteration = 100;
+  // The second half of the test involves random failures
+  // of file creation.
+ const int kRandomFailureTest = kTotalIteration / 2;
+ std::vector<std::string> values;
+ for (int i = 0; i < kTestSize; ++i) {
+ values.push_back("NOT_FOUND");
+ }
+ for (int j = 0; j < kTotalIteration; ++j) {
+ if (j == kRandomFailureTest) {
+ env_->non_writeable_rate_.store(90);
+ }
+ for (int k = 0; k < kTestSize; ++k) {
+      // Here we expect some of the Puts to fail.
+ std::string value = RandomString(&rnd, 100);
+ Status s = Put(Key(k), Slice(value));
+ if (s.ok()) {
+ // update the latest successful put
+ values[k] = value;
+ }
+      // But everything before we start simulating failures should succeed.
+ if (j < kRandomFailureTest) {
+ ASSERT_OK(s);
+ }
+ }
+ }
+
+  // If rocksdb does not do the job correctly, an internal assert will fail here.
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+
+ // verify we have the latest successful update
+ for (int k = 0; k < kTestSize; ++k) {
+ auto v = Get(Key(k));
+ ASSERT_EQ(v, values[k]);
+ }
+
+ // reopen and reverify we have the latest successful update
+ env_->non_writeable_rate_.store(0);
+ Reopen(options);
+ for (int k = 0; k < kTestSize; ++k) {
+ auto v = Get(Key(k));
+ ASSERT_EQ(v, values[k]);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, DynamicMiscOptions) {
+ // Test max_sequential_skip_in_iterations
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 16;
+ options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+ int key0 = key_start;
+ int key1 = key_start + 1;
+ int key2 = key_start + 2;
+ Random rnd(301);
+ ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8)));
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8)));
+ }
+ ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8)));
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(key1));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+ ASSERT_EQ(num_reseek,
+ TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+ };
+ // No reseek
+ assert_reseek_count(100, 0);
+
+ ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}}));
+ // Clear memtable and make new option effective
+ dbfull()->TEST_FlushMemTable(true);
+ // Trigger reseek
+ assert_reseek_count(200, 1);
+
+ ASSERT_OK(
+ dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}}));
+ // Clear memtable and make new option effective
+ dbfull()->TEST_FlushMemTable(true);
+ // No reseek
+ assert_reseek_count(300, 1);
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Test soft_pending_compaction_bytes_limit,
+ // hard_pending_compaction_bytes_limit
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1], {{"soft_pending_compaction_bytes_limit", "200"},
+ {"hard_pending_compaction_bytes_limit", "300"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit);
+ ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit);
+ // Test report_bg_io_stats
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}}));
+ // sanity check
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+ // Test compression
+ // sanity check
+ ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+ &mutable_cf_options));
+ ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression);
+
+ if (Snappy_Supported()) {
+ ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+ &mutable_cf_options));
+ ASSERT_EQ(CompressionType::kSnappyCompression,
+ mutable_cf_options.compression);
+ }
+
+ // Test paranoid_file_checks already done in db_block_cache_test
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 32 * 1024;
+ options.target_file_size_base = 32 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"mypikachu"}, options);
+
+ int numkeys = 20000;
+ for (int i = 0; i < numkeys; i++) {
+ ASSERT_OK(Put(1, Key(i), "val"));
+ }
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = 0; i < numkeys; i++) {
+ ASSERT_EQ(Get(1, Key(i)), "val");
+ }
+
+ ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100);
+ ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100);
+ ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100);
+
+ ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) +
+ TestGetTickerCount(options, GET_HIT_L1) +
+ TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+}
+
+TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+ // iter 0 -- zlib
+ // iter 1 -- bzip2
+ // iter 2 -- lz4
+ // iter 3 -- lz4HC
+ // iter 4 -- xpress
+ CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+ kLZ4Compression, kLZ4HCCompression,
+ kXpressCompression};
+ for (auto comp : compressions) {
+ if (!CompressionTypeSupported(comp)) {
+ continue;
+ }
+ // first_table_version 1 -- generate with table_version == 1, read with
+ // table_version == 2
+ // first_table_version 2 -- generate with table_version == 2, read with
+ // table_version == 1
+ for (int first_table_version = 1; first_table_version <= 2;
+ ++first_table_version) {
+ BlockBasedTableOptions table_options;
+ table_options.format_version = first_table_version;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+ options.compression = comp;
+ DestroyAndReopen(options);
+
+ int kNumKeysWritten = 1000;
+
+ Random rnd(301);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+ }
+
+ table_options.format_version = first_table_version == 1 ? 2 : 1;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+ }
+ }
+ }
+}
+
+TEST_F(DBTest, CloseSpeedup) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_write_buffer_number = 16;
+
+ // Block background threads
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(dbname_, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(dbname_ + "/" + filenames[i]);
+ }
+ env_->DeleteDir(dbname_);
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to level 2
+ // After that, (100K, 200K)
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ }
+
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ Close();
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Unblock background threads
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ Destroy(options);
+}
+
+class DelayedMergeOperator : public MergeOperator {
+ private:
+ DBTest* db_test_;
+
+ public:
+ explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* merge_out) const override {
+ db_test_->env_->addon_time_.fetch_add(1000);
+ merge_out->new_value = "";
+ return true;
+ }
+
+ const char* Name() const override { return "DelayedMergeOperator"; }
+};
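+
+// Illustrative sketch added for clarity (not part of the upstream test
+// suite): a more conventional FullMergeV2() than DelayedMergeOperator
+// above -- it concatenates the existing value and all operands, which is
+// enough to show the interface. It would be wired up the same way, e.g.
+// options.merge_operator.reset(new AppendMergeOperatorSketch());
+class AppendMergeOperatorSketch : public MergeOperator {
+ public:
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    merge_out->new_value.clear();
+    if (merge_in.existing_value != nullptr) {
+      merge_out->new_value.assign(merge_in.existing_value->data(),
+                                  merge_in.existing_value->size());
+    }
+    for (const Slice& operand : merge_in.operand_list) {
+      merge_out->new_value.append(operand.data(), operand.size());
+    }
+    return true;
+  }
+
+  const char* Name() const override { return "AppendMergeOperatorSketch"; }
+};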
+
+TEST_F(DBTest, MergeTestTime) {
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+
+ // Enable time profiling
+ SetPerfLevel(kEnableTime);
+ this->env_->addon_time_.store(0);
+ this->env_->time_elapse_only_sleep_ = true;
+ this->env_->no_slowdown_ = true;
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(new DelayedMergeOperator(this));
+ DestroyAndReopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+ db_->Put(WriteOptions(), "foo", one);
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+ ASSERT_OK(Flush());
+
+ ReadOptions opt;
+ opt.verify_checksums = true;
+ opt.snapshot = nullptr;
+ std::string result;
+ db_->Get(opt, "foo", &result);
+
+ ASSERT_EQ(1000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+
+ ReadOptions read_options;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+
+ ASSERT_EQ(1, count);
+ ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
+#endif // ROCKSDB_USING_THREAD_STATUS
+ this->env_->time_elapse_only_sleep_ = false;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
+ SetPerfLevel(kEnableTime);
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(new DelayedMergeOperator(this));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+}
+
+TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<DelayFilterFactory>(this);
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(kExceptTimeForMutex);
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ // put some data
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ Put(ToString(table * 100 + i), "val");
+ }
+ Flush();
+ }
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ(0U, CountLiveFiles());
+
+ Reopen(options);
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0);
+ delete itr;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, TestLogCleanup) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024; // very small
+ // only two memtables allowed ==> only two log files
+ options.max_write_buffer_number = 2;
+ Reopen(options);
+
+ for (int i = 0; i < 100000; ++i) {
+ Put(Key(i), "val");
+    // Only 2 memtables will be alive at a time, so the number of logs to free
+    // should never exceed 2.
+ ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, EmptyCompactedDB) {
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Status s = Put("new", "value");
+ ASSERT_TRUE(s.IsNotSupported());
+ Close();
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SuggestCompactRangeTest) {
+ class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ saved_context = context;
+ std::unique_ptr<CompactionFilter> empty_filter;
+ return empty_filter;
+ }
+ const char* Name() const override {
+ return "CompactionFilterFactoryGetContext";
+ }
+ static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+ return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+ compaction_filter_factory)
+ ->saved_context.is_manual_compaction;
+ }
+ CompactionFilter::Context saved_context;
+ };
+
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleLevel;
+ options.compaction_filter_factory.reset(
+ new CompactionFilterFactoryGetContext());
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_compaction_bytes = static_cast<uint64_t>(1) << 60; // inf
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 3; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("0,4", FilesPerLevel(0));
+ ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+ options.compaction_filter_factory.get()));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("2,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("3,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("0,4,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("2,4,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("3,4,4", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("0,4,8", FilesPerLevel(0));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+ // compact it three times
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ // All files are compacted
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // nonoverlapping with the file on level 0
+ Slice start("a"), end("b");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ dbfull()->TEST_WaitForCompact();
+
+ // should not compact the level 0 file
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ start = Slice("j");
+ end = Slice("m");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual(
+ options.compaction_filter_factory.get()));
+
+ // now it should compact the level 0 file
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, PromoteL0) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ DestroyAndReopen(options);
+
+ // non overlapping ranges
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {81, 160}, {0, 80}, {161, 240}, {241, 320}};
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (const auto& range : ranges) {
+ for (int32_t j = range.first; j < range.second; j++) {
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size());
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ // Promote L0 level to L2.
+ ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+ // We expect that all the files were trivially moved from L0 to L2
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
+
+ for (const auto& kv : values) {
+ ASSERT_EQ(Get(Key(kv.first)), kv.second);
+ }
+}
+
+TEST_F(DBTest, PromoteL0Failure) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ DestroyAndReopen(options);
+
+ // Produce two L0 files with overlapping ranges.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(Put(Key(3), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), ""));
+ ASSERT_OK(Flush());
+
+ Status status;
+ // Fails because L0 has overlapping files.
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Now there is a file in L1.
+ ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
+
+ ASSERT_OK(Put(Key(5), ""));
+ ASSERT_OK(Flush());
+ // Fails because L1 is non-empty.
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+ ASSERT_TRUE(status.IsInvalidArgument());
+}
+
+// Github issue #596
+TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) {
+ const int kNumLevels = 2;
+ const int kNumL0Files = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024)));
+ Flush();
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, AutomaticConflictsWithManualCompaction) {
+ const int kNumL0Files = 50;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ // never slowdown / stop
+ options.level0_slowdown_writes_trigger = 999999;
+ options.level0_stop_writes_trigger = 999999;
+ options.max_background_compactions = 10;
+ DestroyAndReopen(options);
+
+ // schedule automatic compactions after the manual one starts, but before it
+ // finishes to ensure conflict.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCompaction:Start",
+ "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"},
+ {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ std::atomic<int> callback_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MaybeScheduleFlushOrCompaction:Conflict",
+ [&](void* /*arg*/) { callback_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ port::Thread manual_compaction_thread([this]() {
+ CompactRangeOptions croptions;
+ croptions.exclusive_manual_compaction = true;
+ ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts");
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts");
+
+ ASSERT_GE(callback_count.load(), 1);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForCompact();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
+ Options options = CurrentOptions();
+ options.max_background_compactions = 1;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 36;
+ options.level0_stop_writes_trigger = 36;
+ DestroyAndReopen(options);
+
+ // generate files for manual compaction
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+
+ std::vector<std::string> input_files;
+ input_files.push_back(cf_meta_data.levels[0].files[0].name);
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"},
+ {"DBTest::CompactFilesShouldTriggerAutoCompaction:End",
+ "CompactFilesImpl:1"},
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread manual_compaction_thread([&]() {
+ auto s = db_->CompactFiles(CompactionOptions(),
+ db_->DefaultColumnFamily(), input_files, 0);
+ });
+
+ TEST_SYNC_POINT(
+ "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
+ // generate enough files to trigger compaction
+ for (int i = 0; i < 20; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+ ASSERT_GT(cf_meta_data.levels[0].files.size(),
+ options.level0_file_num_compaction_trigger);
+ TEST_SYNC_POINT(
+ "DBTest::CompactFilesShouldTriggerAutoCompaction:End");
+
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForCompact();
+
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+ ASSERT_LE(cf_meta_data.levels[0].files.size(),
+ options.level0_file_num_compaction_trigger);
+}
+#endif // ROCKSDB_LITE
+
+// Github issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+ int64_t j = 0;
+ for (int i = 0; i < 5; i++) {
+ for (int pass = 1; pass <= 3; pass++) {
+ WriteBatch batch;
+ size_t write_size = 1024 * 1024 * (5 + i);
+ fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n",
+ (write_size / 1024 / 1024), pass);
+ for (;;) {
+ std::string data(3000, j++ % 127 + 20);
+ data += ToString(j);
+ batch.Put(handles_[0], Slice(data), Slice(data));
+ if (batch.GetDataSize() > write_size) {
+ break;
+ }
+ }
+ fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n",
+ (batch.GetDataSize() / 1024 / 1024));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ fprintf(stderr, "done\n");
+ }
+ }
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+// Make sure that Flushes can proceed in parallel with CompactRange()
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+ // iter == 0 -- leveled
+ // iter == 1 -- leveled, but throw in a flush between two levels compacting
+ // iter == 2 -- universal
+ for (int iter = 0; iter < 3; ++iter) {
+ Options options = CurrentOptions();
+ if (iter < 2) {
+ options.compaction_style = kCompactionStyleLevel;
+ } else {
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+ options.write_buffer_size = 110 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int num = 0; num < 14; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ if (iter == 1) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction()::1",
+ "DBTest::FlushesInParallelWithCompactRange:1"},
+ {"DBTest::FlushesInParallelWithCompactRange:2",
+ "DBImpl::RunManualCompaction()::2"}});
+ } else {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBTest::FlushesInParallelWithCompactRange:1"},
+ {"DBTest::FlushesInParallelWithCompactRange:2",
+ "CompactionJob::Run():End"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&]() { Compact("a", "z"); });
+
+ TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+
+    // This has to start a flush. If flushes are blocked, this will try to
+    // create 3 memtables, and that will fail because max_write_buffer_number
+    // is 2.
+ for (int num = 0; num < 3; num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ }
+
+ TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBTest, DelayedWriteRate) {
+ const int kEntriesPerMemTable = 100;
+ const int kTotalFlushes = 12;
+
+ Options options = CurrentOptions();
+ env_->SetBackgroundThreads(1, Env::LOW);
+ options.env = env_;
+ env_->no_slowdown_ = true;
+ options.write_buffer_size = 100000000;
+ options.max_write_buffer_number = 256;
+ options.max_background_compactions = 1;
+ options.level0_file_num_compaction_trigger = 3;
+ options.level0_slowdown_writes_trigger = 3;
+ options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000000;  // Start with 20MB/s
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kEntriesPerMemTable));
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Block compactions
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ for (int i = 0; i < 3; i++) {
+ Put(Key(i), std::string(10000, 'x'));
+ Flush();
+ }
+
+ // These writes will be slowed down to 1KB/s
+ uint64_t estimated_sleep_time = 0;
+ Random rnd(301);
+ Put("", "");
+ uint64_t cur_rate = options.delayed_write_rate;
+ for (int i = 0; i < kTotalFlushes; i++) {
+ uint64_t size_memtable = 0;
+ for (int j = 0; j < kEntriesPerMemTable; j++) {
+ auto rand_num = rnd.Uniform(20);
+      // Spread the entry sizes over a wider range.
+ size_t entry_size = rand_num * rand_num * rand_num;
+ WriteOptions wo;
+ Put(Key(i), std::string(entry_size, 'x'), wo);
+ size_memtable += entry_size + 18;
+ // Occasionally sleep a while
+ if (rnd.Uniform(20) == 6) {
+ env_->SleepForMicroseconds(2666);
+ }
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ estimated_sleep_time += size_memtable * 1000000u / cur_rate;
+    // The rate is reduced twice: once for the memtable switch and once when
+    // the flush finishes.
+ cur_rate = static_cast<uint64_t>(static_cast<double>(cur_rate) *
+ kIncSlowdownRatio * kIncSlowdownRatio);
+ }
+  // Check that the total sleep time falls within a factor of two of the
+  // estimate.
+ ASSERT_GT(env_->addon_time_.load(),
+ static_cast<int64_t>(estimated_sleep_time / 2));
+ ASSERT_LT(env_->addon_time_.load(),
+ static_cast<int64_t>(estimated_sleep_time * 2));
+
+ env_->no_slowdown_ = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, HardLimit) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ env_->SetBackgroundThreads(1, Env::LOW);
+ options.max_write_buffer_number = 256;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 999999;
+ options.level0_stop_writes_trigger = 999999;
+ options.hard_pending_compaction_bytes_limit = 800 << 10;
+ options.max_bytes_for_level_base = 10000000000u;
+ options.max_background_compactions = 1;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::atomic<int> callback_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+ callback_count.fetch_add(1);
+ sleeping_task_low.WakeUp();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ int key_idx = 0;
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+
+ ASSERT_EQ(0, callback_count.load());
+
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ ASSERT_GE(callback_count.load(), 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WaitUntilDone();
+}
+
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
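+// Listener that records the most recently reported write stall condition;
+// CheckCondition() lets tests assert on the current stall state.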
+class WriteStallListener : public EventListener {
+ public:
+ WriteStallListener() : condition_(WriteStallCondition::kNormal) {}
+ void OnStallConditionsChanged(const WriteStallInfo& info) override {
+ MutexLock l(&mutex_);
+ condition_ = info.condition.cur;
+ }
+ bool CheckCondition(WriteStallCondition expected) {
+ MutexLock l(&mutex_);
+ return expected == condition_;
+ }
+ private:
+ port::Mutex mutex_;
+ WriteStallCondition condition_;
+};
+
+TEST_F(DBTest, SoftLimit) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.max_write_buffer_number = 256;
+ options.level0_file_num_compaction_trigger = 1;
+ options.level0_slowdown_writes_trigger = 3;
+ options.level0_stop_writes_trigger = 999999;
+ options.delayed_write_rate = 20000; // About 200KB/s limited rate
+ options.soft_pending_compaction_bytes_limit = 160000;
+ options.target_file_size_base = 99999999; // All into one file
+ options.max_bytes_for_level_base = 50000;
+ options.max_bytes_for_level_multiplier = 10;
+ options.max_background_compactions = 1;
+ options.compression = kNoCompression;
+ WriteStallListener* listener = new WriteStallListener();
+ options.listeners.emplace_back(listener);
+
+  // FlushMemtable with opt.wait=true does not wait for
+  // `OnStallConditionsChanged` to be called. The event listener is triggered
+  // on `JobContext::Clean`, which happens after the flush result is installed.
+  // We use a sync point to create a custom WaitForFlush that waits for the
+  // context cleanup.
+ port::Mutex flush_mutex;
+ port::CondVar flush_cv(&flush_mutex);
+ bool flush_finished = false;
+ auto InstallFlushCallback = [&]() {
+ {
+ MutexLock l(&flush_mutex);
+ flush_finished = false;
+ }
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) {
+ {
+ MutexLock l(&flush_mutex);
+ flush_finished = true;
+ }
+ flush_cv.SignalAll();
+ });
+ };
+ auto WaitForFlush = [&]() {
+ {
+ MutexLock l(&flush_mutex);
+ while (!flush_finished) {
+ flush_cv.Wait();
+ }
+ }
+ SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::BackgroundCallFlush:ContextCleanedUp");
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ // Generating 360KB in Level 3
+ for (int i = 0; i < 72; i++) {
+ Put(Key(i), std::string(5000, 'x'));
+ if (i % 10 == 0) {
+ dbfull()->TEST_FlushMemTable(true, true);
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(3);
+
+ // Generating 360KB in Level 2
+ for (int i = 0; i < 72; i++) {
+ Put(Key(i), std::string(5000, 'x'));
+ if (i % 10 == 0) {
+ dbfull()->TEST_FlushMemTable(true, true);
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(2);
+
+ Put(Key(0), "");
+
+ test::SleepingBackgroundTask sleeping_task_low;
+ // Block compactions
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+
+  // Create 3 L0 files, making the L0 score 3.
+ for (int i = 0; i < 3; i++) {
+ Put(Key(i), std::string(5000, 'x'));
+ Put(Key(100 - i), std::string(5000, 'x'));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ dbfull()->TEST_FlushMemTable(true, true);
+ WaitForFlush();
+ }
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ sleeping_task_low.Reset();
+ dbfull()->TEST_WaitForCompact();
+
+  // Now there is one L1 file, but it doesn't trigger
+  // soft_pending_compaction_bytes_limit. The L1 file size is around 30KB.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Only allow one compaction to go through.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+ // Schedule a sleeping task.
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_low, Env::Priority::LOW);
+ });
+
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+  // Create 3 L0 files, making the L0 score 3.
+ for (int i = 0; i < 3; i++) {
+ Put(Key(10 + i), std::string(5000, 'x'));
+ Put(Key(90 - i), std::string(5000, 'x'));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ dbfull()->TEST_FlushMemTable(true, true);
+ WaitForFlush();
+ }
+
+  // Wake up the sleeping task so compaction can run, then wait for it to go
+  // back to sleep to make sure exactly one compaction goes through.
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 60KB) which exceeds the 50KB base by
+  // 10KB. Given the level multiplier of 10, the estimated pending compaction
+  // bytes are around 100KB, which doesn't trigger
+  // soft_pending_compaction_bytes_limit.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Create 3 L0 files, making the L0 score 3, higher than the L1 score.
+ for (int i = 0; i < 3; i++) {
+ Put(Key(20 + i), std::string(5000, 'x'));
+ Put(Key(80 - i), std::string(5000, 'x'));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ dbfull()->TEST_FlushMemTable(true, true);
+ WaitForFlush();
+ }
+  // Wake up the sleeping task so compaction can run, then wait for it to go
+  // back to sleep to make sure exactly one compaction goes through.
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 90KB) which exceeds the 50KB base by
+  // 40KB. L2 size is 360KB, so the estimated level fanout is 4 and the
+  // estimated pending compaction bytes are around 200KB, which triggers
+  // soft_pending_compaction_bytes_limit.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Shrink the level base so L2 will hit the soft limit more easily.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_bytes_for_level_base", "5000"},
+ }));
+
+ Put("", "");
+ Flush();
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WaitUntilSleeping();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, LastWriteBufferDelay) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.max_write_buffer_number = 4;
+ options.delayed_write_rate = 20000;
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ int kNumKeysPerMemtable = 3;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+
+ Reopen(options);
+ test::SleepingBackgroundTask sleeping_task;
+ // Block flushes
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ sleeping_task.WaitUntilSleeping();
+
+  // Fill 3 memtables, one at a time. With flushes blocked, this should not
+  // trigger a write delay yet.
+ for (int i = 0; i < 3; i++) {
+ // Fill one mem table
+ for (int j = 0; j < kNumKeysPerMemtable; j++) {
+ Put(Key(j), "");
+ }
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ }
+  // Inserting one more entry creates a fourth memtable (the last one allowed),
+  // which triggers the write slowdown.
+ Put(Key(0), "");
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+}
+#endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+
+TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
+ CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+ kLZ4Compression, kLZ4HCCompression,
+ kXpressCompression};
+ for (auto comp : compressions) {
+ if (!CompressionTypeSupported(comp)) {
+ // not supported, we should fail the Open()
+ Options options = CurrentOptions();
+ options.compression = comp;
+ ASSERT_TRUE(!TryReopen(options).ok());
+ // Try if CreateColumnFamily also fails
+ options.compression = kNoCompression;
+ ASSERT_OK(TryReopen(options));
+ ColumnFamilyOptions cf_options(options);
+ cf_options.compression = comp;
+ ColumnFamilyHandle* handle;
+ ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+ }
+ }
+}
+
+TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
+ Options options = CurrentOptions();
+ options.max_open_files = 100;
+ Reopen(options);
+
+ ColumnFamilyOptions cf_options(options);
+  // ttl is now supported even when max_open_files is not -1.
+ cf_options.ttl = 3600;
+ ColumnFamilyHandle* handle;
+ ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle));
+ delete handle;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, RowCache) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+}
+
+TEST_F(DBTest, PinnableSliceAndRowCache) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ PinnableSlice pin_slice;
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+ ASSERT_EQ(pin_slice.ToString(), "bar");
+    // Entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+  // After the PinnableSlice is destroyed, the element is added back to the
+  // LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, DeletingOldWalAfterDrop) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"},
+ {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Options options = CurrentOptions();
+ options.max_total_wal_size = 8192;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 1 << 20;
+ options.level0_file_num_compaction_trigger = (1 << 30);
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"cf1", "cf2"}, options);
+ ASSERT_OK(Put(0, "key1", DummyString(8192)));
+ ASSERT_OK(Put(0, "key2", DummyString(8192)));
+  // The oldest WAL should now be getting flushed.
+ ASSERT_OK(db_->DropColumnFamily(handles_[0]));
+ // all flushes should now do nothing because their CF is dropped
+ TEST_SYNC_POINT("Test:AllowFlushes");
+ TEST_SYNC_POINT("Test:WaitForFlush");
+ uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
+ ASSERT_OK(Put(1, "key3", DummyString(8192)));
+ ASSERT_OK(Put(1, "key4", DummyString(8192)));
+ // new wal should have been created
+ uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
+ EXPECT_GT(lognum2, lognum1);
+}
+
+TEST_F(DBTest, UnsupportedManualSync) {
+ DestroyAndReopen(CurrentOptions());
+ env_->is_wal_sync_thread_safe_.store(false);
+ Status s = db_->SyncWAL();
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
+ ::testing::Combine(::testing::Values(1, 4),
+ ::testing::Bool()));
+
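+// Writes that fill the small memtables should stall while background work is
+// paused and complete only after ContinueBackgroundWork() is called.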
+TEST_F(DBTest, PauseBackgroundWorkTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000; // Small write buffer
+ Reopen(options);
+
+ std::vector<port::Thread> threads;
+ std::atomic<bool> done(false);
+ db_->PauseBackgroundWork();
+ threads.emplace_back([&]() {
+ Random rnd(301);
+ for (int i = 0; i < 10000; ++i) {
+ Put(RandomString(&rnd, 10), RandomString(&rnd, 10));
+ }
+ done.store(true);
+ });
+ env_->SleepForMicroseconds(200000);
+ // make sure the thread is not done
+ ASSERT_FALSE(done.load());
+ db_->ContinueBackgroundWork();
+ for (auto& t : threads) {
+ t.join();
+ }
+ // now it's done
+ ASSERT_TRUE(done.load());
+}
+
+// Keep spawning short-living threads that create an iterator and quit.
+// Meanwhile in another thread keep flushing memtables.
+// This used to cause a deadlock.
+TEST_F(DBTest, ThreadLocalPtrDeadlock) {
+ std::atomic<int> flushes_done{0};
+ std::atomic<int> threads_destroyed{0};
+ auto done = [&] {
+ return flushes_done.load() > 10;
+ };
+
+ port::Thread flushing_thread([&] {
+ for (int i = 0; !done(); ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"),
+ Slice(std::to_string(i).c_str())));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ int cnt = ++flushes_done;
+ fprintf(stderr, "Flushed %d times\n", cnt);
+ }
+ });
+
+ std::vector<port::Thread> thread_spawning_threads(10);
+ for (auto& t: thread_spawning_threads) {
+ t = port::Thread([&] {
+ while (!done()) {
+ {
+ port::Thread tmp_thread([&] {
+ auto it = db_->NewIterator(ReadOptions());
+ delete it;
+ });
+ tmp_thread.join();
+ }
+ ++threads_destroyed;
+ }
+ });
+ }
+
+ for (auto& t: thread_spawning_threads) {
+ t.join();
+ }
+ flushing_thread.join();
+ fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n",
+ flushes_done.load(), threads_destroyed.load());
+}
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(0, "foo", "bar"));
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 8LL * 1024 * 1024 * 1024LL;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, CreationTimeOfOldestFile) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ bool set_file_creation_time_to_zero = true;
+ int idx = 0;
+
+ int64_t time_1 = 0;
+ env_->GetCurrentTime(&time_1);
+ const uint64_t uint_time_1 = static_cast<uint64_t>(time_1);
+
+ // Add 50 hours
+ env_->addon_time_.fetch_add(50 * 60 * 60);
+
+ int64_t time_2 = 0;
+ env_->GetCurrentTime(&time_2);
+ const uint64_t uint_time_2 = static_cast<uint64_t>(time_2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ if (set_file_creation_time_to_zero) {
+ if (idx == 0) {
+ props->file_creation_time = 0;
+ idx++;
+ } else if (idx == 1) {
+ props->file_creation_time = uint_time_1;
+ idx = 0;
+ }
+ } else {
+ if (idx == 0) {
+ props->file_creation_time = uint_time_1;
+ idx++;
+ } else if (idx == 1) {
+ props->file_creation_time = uint_time_2;
+ }
+ }
+ });
+  // Set all file creation times in the manifest to 0.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FileMetaData::FileMetaData", [&](void* arg) {
+ FileMetaData* meta = static_cast<FileMetaData*>(arg);
+ meta->file_creation_time = 0;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+
+ // At this point there should be 2 files, one with file_creation_time = 0 and
+ // the other non-zero. GetCreationTimeOfOldestFile API should return 0.
+ uint64_t creation_time;
+ Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);
+ ASSERT_EQ(0, creation_time);
+ ASSERT_EQ(s1, Status::OK());
+
+ // Testing with non-zero file creation time.
+ set_file_creation_time_to_zero = false;
+ options = CurrentOptions();
+ options.max_open_files = -1;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+
+ // At this point there should be 2 files with non-zero file creation time.
+ // GetCreationTimeOfOldestFile API should return non-zero value.
+ uint64_t ctime;
+ Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+ ASSERT_EQ(uint_time_1, ctime);
+ ASSERT_EQ(s2, Status::OK());
+
+ // Testing with max_open_files != -1
+ options = CurrentOptions();
+ options.max_open_files = 10;
+ DestroyAndReopen(options);
+ Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+ ASSERT_EQ(s3, Status::NotSupported());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test2.cc b/src/rocksdb/db/db_test2.cc
new file mode 100644
index 000000000..f4e8e960a
--- /dev/null
+++ b/src/rocksdb/db/db_test2.cc
@@ -0,0 +1,4695 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/read_callback.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/fault_injection_test_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest2 : public DBTestBase {
+ public:
+ DBTest2() : DBTestBase("/db_test2") {}
+};
+
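+// Parameterized on whether index and filter blocks are cached in the (tiny)
+// block cache.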
+class PrefixFullBloomWithReverseComparator
+ : public DBTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ PrefixFullBloomWithReverseComparator()
+ : DBTestBase("/prefix_bloom_reverse") {}
+ void SetUp() override { if_cache_filter_ = GetParam(); }
+ bool if_cache_filter_;
+};
+
+TEST_P(PrefixFullBloomWithReverseComparator,
+ PrefixFullBloomWithReverseComparator) {
+ Options options = last_options_;
+ options.comparator = ReverseBytewiseComparator();
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ if (if_cache_filter_) {
+ bbto.no_block_cache = false;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.block_cache = NewLRUCache(1);
+ }
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));
+
+ dbfull()->Flush(FlushOptions());
+
+ if (bbto.block_cache) {
+ bbto.block_cache->EraseUnRefEntries();
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek("bar345");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar234", iter->key().ToString());
+ ASSERT_EQ("foo2", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar123", iter->key().ToString());
+ ASSERT_EQ("foo", iter->value().ToString());
+
+ iter->Seek("foo234");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo123", iter->key().ToString());
+ ASSERT_EQ("foo3", iter->value().ToString());
+
+ iter->Seek("bar");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+}
+
+INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
+ PrefixFullBloomWithReverseComparator, testing::Bool());
+
+TEST_F(DBTest2, IteratorPropertyVersionNumber) {
+ Put("", "");
+ Iterator* iter1 = db_->NewIterator(ReadOptions());
+ std::string prop_value;
+ ASSERT_OK(
+ iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number1 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ Put("", "");
+ Flush();
+
+ Iterator* iter2 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(
+ iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number2 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_GT(version_number2, version_number1);
+
+ Put("", "");
+
+ Iterator* iter3 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(
+ iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number3 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_EQ(version_number2, version_number3);
+
+ iter1->SeekToFirst();
+ ASSERT_OK(
+ iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number1_new =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+ ASSERT_EQ(version_number1, version_number1_new);
+
+ delete iter1;
+ delete iter2;
+ delete iter3;
+}
+
+TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ ASSERT_OK(Flush(1));
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ std::string value;
+ value = Get(1, "a");
+}
+
+TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.max_successive_merges = 3;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ Put("poi", "Finch");
+ db_->Merge(WriteOptions(), "poi", "Reese");
+ db_->Merge(WriteOptions(), "poi", "Shaw");
+ db_->Merge(WriteOptions(), "poi", "Root");
+ options.max_successive_merges = 2;
+ Reopen(options);
+}
+
+#ifndef ROCKSDB_LITE
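+// Parameterized on <use_old_interface, cost_cache>: either the legacy
+// db_write_buffer_size option or a WriteBufferManager caps total memtable
+// memory, optionally charging that memory to a block cache.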
+class DBTestSharedWriteBufferAcrossCFs
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ DBTestSharedWriteBufferAcrossCFs()
+ : DBTestBase("/db_test_shared_write_buffer") {}
+ void SetUp() override {
+ use_old_interface_ = std::get<0>(GetParam());
+ cost_cache_ = std::get<1>(GetParam());
+ }
+ bool use_old_interface_;
+ bool cost_cache_;
+};
+
+TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+
+  // Avoid nondeterministic values from malloc_usable_size():
+  // force the arena block size to 1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::Arena:0", [&](void* arg) {
+ size_t* block_size = static_cast<size_t*>(arg);
+ *block_size = 1;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::AllocateNewBlock:0", [&](void* arg) {
+ std::pair<size_t*, size_t*>* pair =
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
+ *std::get<0>(*pair) = *std::get<1>(*pair);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The total soft write buffer size is about 105000
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+
+ if (use_old_interface_) {
+ options.db_write_buffer_size = 120000; // this is the real limit
+ } else if (!cost_cache_) {
+ options.write_buffer_manager.reset(new WriteBufferManager(114285));
+ } else {
+ options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
+ }
+ options.write_buffer_size = 500000; // this is never hit
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ std::function<void()> wait_flush = [&]() {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+ };
+
+  // Create some data and flush "default" and "nikitich" so that they are the
+  // most recently flushed CFs.
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ Flush(3);
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ Flush(0);
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+ }
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+ }
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ // No flush should trigger
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+
+ // Trigger a flush. Flushing "nikitich".
+ ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Without hitting the threshold, no flush should trigger.
+ ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Hit the write buffer limit again. "default"
+ // will have been flushed.
+ ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
+ wait_flush();
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+  // Trigger another flush. This time "dobrynia" is flushed. "pikachu" should
+  // not be flushed, even though it has never been flushed.
+ ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
+ wait_flush();
+ ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ Close();
+ options.write_buffer_manager.reset();
+ last_options_.write_buffer_manager.reset();
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
+ DBTestSharedWriteBufferAcrossCFs,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(false, true)));
+
+TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
+ std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+  // Avoid nondeterministic values from malloc_usable_size():
+  // force the arena block size to 1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::Arena:0", [&](void* arg) {
+ size_t* block_size = static_cast<size_t*>(arg);
+ *block_size = 1;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::AllocateNewBlock:0", [&](void* arg) {
+ std::pair<size_t*, size_t*>* pair =
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
+ *std::get<0>(*pair) = *std::get<1>(*pair);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 500000; // this is never hit
+ // Use a write buffer total size so that the soft limit is about
+ // 105000.
+ options.write_buffer_manager.reset(new WriteBufferManager(120000));
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ ASSERT_OK(DestroyDB(dbname2, options));
+ DB* db2 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname2, &db2));
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ std::function<void()> wait_flush = [&]() {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+ };
+
+ // Trigger a flush on cf2
+ ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
+ wait_flush();
+
+ // Insert to DB2
+ ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
+ wait_flush();
+
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
+ GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
+ GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(0));
+ }
+
+  // Trigger a flush of another CF in DB1.
+ ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(0));
+ }
+
+  // Trigger a flush in DB2.
+ ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
+ wait_flush();
+ ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
+ wait_flush();
+ static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(1));
+ }
+
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ std::shared_ptr<Cache> cache =
+ NewLRUCache(LRUCacheOptions(10000000, 1, false, 0.0));
+ options.write_buffer_size = 50000; // this is never hit
+  // A buffer size of 0 means no write buffer limit; memtable memory is still
+  // charged against the cache.
+ options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ // One dummy entry is 256KB.
+ ASSERT_GT(cache->GetUsage(), 128000);
+}
+
+namespace {
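+  // Check via MultiGet that every key in keys_must_exist is present and every
+  // key in keys_must_not_exist returns NotFound.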
+ void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
+ const std::vector<Slice>& keys_must_not_exist) {
+ // Ensure that expected keys exist
+ std::vector<std::string> values;
+ if (keys_must_exist.size() > 0) {
+ std::vector<Status> status_list =
+ db->MultiGet(ReadOptions(), keys_must_exist, &values);
+ for (size_t i = 0; i < keys_must_exist.size(); i++) {
+ ASSERT_OK(status_list[i]);
+ }
+ }
+
+ // Ensure that given keys don't exist
+ if (keys_must_not_exist.size() > 0) {
+ std::vector<Status> status_list =
+ db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
+ for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
+ ASSERT_TRUE(status_list[i].IsNotFound());
+ }
+ }
+ }
+
+} // namespace
+
+TEST_F(DBTest2, WalFilterTest) {
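+  // WAL filter that applies the requested processing option to the record at
+  // a given index and continues processing for all other records.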
+ class TestWalFilter : public WalFilter {
+ private:
+ // Processing option that is requested to be applied at the given index
+ WalFilter::WalProcessingOption wal_processing_option_;
+ // Index at which to apply wal_processing_option_
+ // At other indexes default wal_processing_option::kContinueProcessing is
+ // returned.
+ size_t apply_option_at_record_index_;
+ // Current record index, incremented with each record encountered.
+ size_t current_record_index_;
+
+ public:
+ TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
+ size_t apply_option_for_record_index)
+ : wal_processing_option_(wal_processing_option),
+ apply_option_at_record_index_(apply_option_for_record_index),
+ current_record_index_(0) {}
+
+ WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) const override {
+ WalFilter::WalProcessingOption option_to_return;
+
+ if (current_record_index_ == apply_option_at_record_index_) {
+ option_to_return = wal_processing_option_;
+ }
+ else {
+ option_to_return = WalProcessingOption::kContinueProcessing;
+ }
+
+ // The filter is passed as a const object so that RocksDB does not modify
+ // it; however, we modify it for our own purposes here and hence cast the
+ // constness away.
+ (const_cast<TestWalFilter*>(this)->current_record_index_)++;
+
+ return option_to_return;
+ }
+
+ const char* Name() const override { return "TestWalFilter"; }
+ };
+
+ // Create 3 batches with two keys each
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ // Test with all WAL processing options
+ for (int option = 0;
+ option < static_cast<int>(
+ WalFilter::WalProcessingOption::kWalProcessingOptionMax);
+ option++) {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({ "pikachu" }, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ static_cast<WalFilter::WalProcessingOption>(option);
+
+ // Create a test filter that applies wal_processing_option at the record
+ // with index apply_option_for_record_index (zero-based, so the second one)
+ size_t apply_option_for_record_index = 1;
+ TestWalFilter test_wal_filter(wal_processing_option,
+ apply_option_for_record_index);
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter;
+ Status status =
+ TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
+ if (wal_processing_option ==
+ WalFilter::WalProcessingOption::kCorruptedRecord) {
+ assert(!status.ok());
+ // In case of corruption we can turn off paranoid_checks to reopen the
+ // database
+ options.paranoid_checks = false;
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+ }
+ else {
+ assert(status.ok());
+ }
+
+ // Compute which keys we expect to be found
+ // and which we expect not to be found after recovery.
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist;
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kCorruptedRecord:
+ case WalFilter::WalProcessingOption::kContinueProcessing: {
+ fprintf(stderr, "Testing with complete WAL processing\n");
+ // we expect all records to be processed
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ break;
+ }
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
+ fprintf(stderr,
+ "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
+ apply_option_for_record_index);
+ // We expect the keys of the record at apply_option_for_record_index to
+ // not be found.
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i == apply_option_for_record_index) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+ break;
+ }
+ case WalFilter::WalProcessingOption::kStopReplay: {
+ fprintf(stderr,
+ "Testing with stopping replay from record %" ROCKSDB_PRIszt
+ "\n",
+ apply_option_for_record_index);
+ // We expect records at and beyond apply_option_for_record_index to not
+ // be found.
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i >= apply_option_for_record_index) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+ break;
+ }
+ default:
+ assert(false); // unhandled case
+ }
+
+ bool checked_after_reopen = false;
+
+ while (true) {
+ // Ensure that expected keys exist and unexpected keys don't exist
+ // after recovery
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+ if (checked_after_reopen) {
+ break;
+ }
+
+ // Reopen the database again to make sure previous log(s) are not used
+ // (even if they were skipped); this reopen does not use the WAL filter
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+ checked_after_reopen = true;
+ }
+ }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
+ class ChangeBatchHandler : public WriteBatch::Handler {
+ private:
+ // Batch to insert keys in
+ WriteBatch* new_write_batch_;
+ // Number of keys to add in the new batch
+ size_t num_keys_to_add_in_new_batch_;
+ // Number of keys added to new batch
+ size_t num_keys_added_;
+
+ public:
+ ChangeBatchHandler(WriteBatch* new_write_batch,
+ size_t num_keys_to_add_in_new_batch)
+ : new_write_batch_(new_write_batch),
+ num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+ num_keys_added_(0) {}
+ void Put(const Slice& key, const Slice& value) override {
+ if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
+ new_write_batch_->Put(key, value);
+ ++num_keys_added_;
+ }
+ }
+ };
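+ // Since the test batches contain only Put operations, the rebuilt batch
+ // keeps at most num_keys_to_add_in_new_batch_ of them; the rest of each
+ // replayed batch is simply not copied over.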
+
+ class TestWalFilterWithChangeBatch : public WalFilter {
+ private:
+ // Index at which to start changing records
+ size_t change_records_from_index_;
+ // Number of keys to add in the new batch
+ size_t num_keys_to_add_in_new_batch_;
+ // Current record index, incremented with each record encountered.
+ size_t current_record_index_;
+
+ public:
+ TestWalFilterWithChangeBatch(size_t change_records_from_index,
+ size_t num_keys_to_add_in_new_batch)
+ : change_records_from_index_(change_records_from_index),
+ num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+ current_record_index_(0) {}
+
+ WalProcessingOption LogRecord(const WriteBatch& batch,
+ WriteBatch* new_batch,
+ bool* batch_changed) const override {
+ if (current_record_index_ >= change_records_from_index_) {
+ ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
+ batch.Iterate(&handler);
+ *batch_changed = true;
+ }
+
+ // The filter is passed as a const object so that RocksDB does not modify
+ // it; however, we modify it for our own purposes here and hence cast the
+ // constness away.
+ (const_cast<TestWalFilterWithChangeBatch*>(this)
+ ->current_record_index_)++;
+
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({ "pikachu" }, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ // Create a test filter that rewrites batches starting at
+ // change_records_from_index, keeping num_keys_to_add_in_new_batch keys each
+ size_t change_records_from_index = 1;
+ size_t num_keys_to_add_in_new_batch = 1;
+ TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
+ change_records_from_index, num_keys_to_add_in_new_batch);
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_with_change_batch;
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+ // Ensure that all keys exist before change_records_from_index_, and that
+ // from that index on only a single key per batch exists, since our filter
+ // copies just one key for each changed batch
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist;
+
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+
+ bool checked_after_reopen = false;
+
+ while (true) {
+ // Ensure that expected keys exist and unexpected keys don't exist
+ // after recovery
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+ if (checked_after_reopen) {
+ break;
+ }
+
+ // Reopen the database again to make sure previous log(s) are not used
+ // (even if they were skipped); this reopen does not use the WAL filter
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+ checked_after_reopen = true;
+ }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
+ class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
+ public:
+ WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch,
+ bool* batch_changed) const override {
+ *new_batch = batch;
+ new_batch->Put("key_extra", "value_extra");
+ *batch_changed = true;
+ return WalProcessingOption::kContinueProcessing;
+ }
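+ // The rebuilt batch ends up with more entries than the original record;
+ // recovery rejects such filters, which is why the reopen below is expected
+ // to return a NotSupported status.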
+
+ const char* Name() const override {
+ return "WalFilterTestWithChangeBatchExtraKeys";
+ }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({ "pikachu" }, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ // Create a test filter that would add extra keys
+ TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_extra_keys;
+ Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(status.IsNotSupported());
+
+ // Reopen without filter, now reopen should succeed - previous
+ // attempt to open must not have altered the db.
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist; // empty vector
+
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+}
+
+TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
+ class TestWalFilterWithColumnFamilies : public WalFilter {
+ private:
+ // column_family_id -> log_number map (provided to WALFilter)
+ std::map<uint32_t, uint64_t> cf_log_number_map_;
+ // column_family_name -> column_family_id map (provided to WALFilter)
+ std::map<std::string, uint32_t> cf_name_id_map_;
+ // column_family_id -> keys_found_in_wal map
+ // We store keys that are applicable to the column_family
+ // during recovery (i.e. aren't already flushed to SST file(s))
+ // for verification against the keys we expect.
+ std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
+ public:
+ void ColumnFamilyLogNumberMap(
+ const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+ const std::map<std::string, uint32_t>& cf_name_id_map) override {
+ cf_log_number_map_ = cf_lognumber_map;
+ cf_name_id_map_ = cf_name_id_map;
+ }
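+ // RocksDB provides these maps before WAL replay; records from logs older
+ // than a column family's recorded log number are already persisted in SST
+ // files, so LogRecordFound can skip them for that column family.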
+
+ WalProcessingOption LogRecordFound(unsigned long long log_number,
+ const std::string& /*log_file_name*/,
+ const WriteBatch& batch,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) override {
+ class LogRecordBatchHandler : public WriteBatch::Handler {
+ private:
+ const std::map<uint32_t, uint64_t> & cf_log_number_map_;
+ std::map<uint32_t, std::vector<std::string>> & cf_wal_keys_;
+ unsigned long long log_number_;
+ public:
+ LogRecordBatchHandler(unsigned long long current_log_number,
+ const std::map<uint32_t, uint64_t> & cf_log_number_map,
+ std::map<uint32_t, std::vector<std::string>> & cf_wal_keys) :
+ cf_log_number_map_(cf_log_number_map),
+ cf_wal_keys_(cf_wal_keys),
+ log_number_(current_log_number){}
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& /*value*/) override {
+ auto it = cf_log_number_map_.find(column_family_id);
+ assert(it != cf_log_number_map_.end());
+ unsigned long long log_number_for_cf = it->second;
+ // If the current record is applicable for column_family_id
+ // (i.e. isn't flushed to SST file(s) for column_family_id)
+ // add it to the cf_wal_keys_ map for verification.
+ if (log_number_ >= log_number_for_cf) {
+ cf_wal_keys_[column_family_id].push_back(std::string(key.data(),
+ key.size()));
+ }
+ return Status::OK();
+ }
+ } handler(log_number, cf_log_number_map_, cf_wal_keys_);
+
+ batch.Iterate(&handler);
+
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override {
+ return "WalFilterTestWithColumnFamilies";
+ }
+
+ const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
+ return cf_wal_keys_;
+ }
+
+ const std::map<std::string, uint32_t> & GetColumnFamilyNameIdMap() {
+ return cf_name_id_map_;
+ }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys_pre_flush(3);
+
+ batch_keys_pre_flush[0].push_back("key1");
+ batch_keys_pre_flush[0].push_back("key2");
+ batch_keys_pre_flush[1].push_back("key3");
+ batch_keys_pre_flush[1].push_back("key4");
+ batch_keys_pre_flush[2].push_back("key5");
+ batch_keys_pre_flush[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({ "pikachu" }, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys_pre_flush[i][j], DummyString(1024));
+ batch.Put(handles_[1], batch_keys_pre_flush[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ // Flush default column-family
+ db_->Flush(FlushOptions(), handles_[0]);
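+ // Only the default column family is flushed, so its pre-flush keys land in
+ // an SST file while pikachu's copies remain only in the WAL.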
+
+ // Do some more writes
+ std::vector<std::vector<std::string>> batch_keys_post_flush(3);
+
+ batch_keys_post_flush[0].push_back("key7");
+ batch_keys_post_flush[0].push_back("key8");
+ batch_keys_post_flush[1].push_back("key9");
+ batch_keys_post_flush[1].push_back("key10");
+ batch_keys_post_flush[2].push_back("key11");
+ batch_keys_post_flush[2].push_back("key12");
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ batch.Put(handles_[0], batch_keys_post_flush[i][j], DummyString(1024));
+ batch.Put(handles_[1], batch_keys_post_flush[i][j], DummyString(1024));
+ }
+ dbfull()->Write(WriteOptions(), &batch);
+ }
+
+ // On recovery we should only find the post-flush batches applicable to the
+ // default CF, but both pre- and post-flush batches applicable to pikachu CF
+
+ // Create a test filter that records the WAL keys seen per column family
+ TestWalFilterWithColumnFamilies test_wal_filter_column_families;
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_column_families;
+ Status status =
+ TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
+ ASSERT_TRUE(status.ok());
+
+ // verify that handles_[0] only has post_flush keys
+ // while handles_[1] has pre and post flush keys
+ auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
+ auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
+ size_t index = 0;
+ auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
+ // default column-family, only post_flush keys are expected
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_post_flush[i][j]);
+ ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+ }
+ }
+ ASSERT_TRUE(index == keys_cf.size());
+
+ index = 0;
+ keys_cf = cf_wal_keys[name_id_map["pikachu"]];
+ // pikachu column-family, all keys are expected
+ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_pre_flush[i][j]);
+ ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+ }
+ }
+
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_post_flush[i][j]);
+ ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+ }
+ }
+ ASSERT_TRUE(index == keys_cf.size());
+}
+
+TEST_F(DBTest2, PresetCompressionDict) {
+ // Verifies that compression ratio improves when dictionary is enabled, and
+ // improves even further when the dictionary is trained by ZSTD.
+ const size_t kBlockSizeBytes = 4 << 10;
+ const size_t kL0FileBytes = 128 << 10;
+ const size_t kApproxPerBlockOverheadBytes = 50;
+ const int kNumL0Files = 5;
+
+ Options options;
+ // Make sure to use any custom env that the test is configured with.
+ options.env = CurrentOptions().env;
+ options.allow_concurrent_memtable_write = false;
+ options.arena_block_size = kBlockSizeBytes;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
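+ // SpecialSkipListFactory is a test-only memtable factory; with this
+ // argument it should mark the memtable full after roughly kL0FileBytes /
+ // kBlockSizeBytes entries, so each L0 file ends up near kL0FileBytes.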
+ options.num_levels = 2;
+ options.target_file_size_base = kL0FileBytes;
+ options.target_file_size_multiplier = 2;
+ options.write_buffer_size = kL0FileBytes;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = kBlockSizeBytes;
+ std::vector<CompressionType> compression_types;
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+#if LZ4_VERSION_NUMBER >= 10400 // r124+
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+#endif // LZ4_VERSION_NUMBER >= 10400
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ }
+
+ enum DictionaryTypes : int {
+ kWithoutDict,
+ kWithDict,
+ kWithZSTDTrainedDict,
+ kDictEnd,
+ };
+
+ for (auto compression_type : compression_types) {
+ options.compression = compression_type;
+ size_t bytes_without_dict = 0;
+ size_t bytes_with_dict = 0;
+ size_t bytes_with_zstd_trained_dict = 0;
+ for (int i = kWithoutDict; i < kDictEnd; i++) {
+ // First iteration: compress without preset dictionary
+ // Second iteration: compress with preset dictionary
+ // Third iteration (zstd only): compress with zstd-trained dictionary
+ //
+ // To make sure the compression dictionary has the intended effect, we
+ // verify the compressed size is smaller in successive iterations. Also in
+ // the non-first iterations, verify the data we get out is the same data
+ // we put in.
+ switch (i) {
+ case kWithoutDict:
+ options.compression_opts.max_dict_bytes = 0;
+ options.compression_opts.zstd_max_train_bytes = 0;
+ break;
+ case kWithDict:
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = 0;
+ break;
+ case kWithZSTDTrainedDict:
+ if (compression_type != kZSTD) {
+ continue;
+ }
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+ break;
+ default:
+ assert(false);
+ }
+
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ std::string seq_datas[10];
+ for (int j = 0; j < 10; ++j) {
+ seq_datas[j] =
+ RandomString(&rnd, kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+ }
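+ // Ten distinct block-sized values, each reused for runs of ten keys, give
+ // the preset dictionary recurring content to exploit across data blocks.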
+
+ ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+ for (int j = 0; j < kNumL0Files; ++j) {
+ for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+ auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+ ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+ seq_datas[(key_num / 10) % 10]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+ }
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+ // Get the live sst files size
+ size_t total_sst_bytes = TotalSize(1);
+ if (i == kWithoutDict) {
+ bytes_without_dict = total_sst_bytes;
+ } else if (i == kWithDict) {
+ bytes_with_dict = total_sst_bytes;
+ } else if (i == kWithZSTDTrainedDict) {
+ bytes_with_zstd_trained_dict = total_sst_bytes;
+ }
+
+ for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+ j++) {
+ ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+ }
+ if (i == kWithDict) {
+ ASSERT_GT(bytes_without_dict, bytes_with_dict);
+ } else if (i == kWithZSTDTrainedDict) {
+ // In zstd compression, it is sometimes possible that using a trained
+ // dictionary does not get as good a compression ratio as without
+ // training.
+ // But using a dictionary (with or without training) should always give a
+ // better compression ratio than not using one.
+ ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+ bytes_without_dict > bytes_with_zstd_trained_dict);
+ }
+
+ DestroyAndReopen(options);
+ }
+ }
+}
+
+TEST_F(DBTest2, PresetCompressionDictLocality) {
+ if (!ZSTD_Supported()) {
+ return;
+ }
+ // Verifies that compression dictionary is generated from local data. The
+ // verification simply checks all output SSTs have different compression
+ // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+ // the future.
+ const int kNumEntriesPerFile = 1 << 10; // 1K entries
+ const int kNumBytesPerEntry = 1 << 10; // 1KB
+ const int kNumFiles = 4;
+ Options options = CurrentOptions();
+ options.compression = kZSTD;
+ options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
+ options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+ RandomString(&rnd, kNumBytesPerEntry)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+ }
+
+ // Store all the dictionaries generated during a full compaction.
+ std::vector<std::string> compression_dicts;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ [&](void* arg) {
+ compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ CompactRangeOptions compact_range_opts;
+ compact_range_opts.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
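+ // kForceOptimized makes the existing bottommost-level files participate in
+ // the compaction, so each rewritten output file gets its own dictionary
+ // (the checks below expect one captured dictionary per L1 file).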
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+ // Dictionary compression should not be so good as to compress four totally
+ // random files into one. If it does then there's probably something wrong
+ // with the test.
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ // Furthermore, there should be one compression dictionary generated per file.
+ // And they should all be different from each other.
+ ASSERT_EQ(NumTableFilesAtLevel(1),
+ static_cast<int>(compression_dicts.size()));
+ for (size_t i = 1; i < compression_dicts.size(); ++i) {
+ std::string& a = compression_dicts[i - 1];
+ std::string& b = compression_dicts[i];
+ size_t alen = a.size();
+ size_t blen = b.size();
+ ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+ }
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+ explicit CompactionCompressionListener(Options* db_options)
+ : db_options_(db_options) {}
+
+ void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+ // Figure out last level with files
+ int bottommost_level = 0;
+ for (int level = 0; level < db->NumberLevels(); level++) {
+ std::string files_at_level;
+ ASSERT_TRUE(
+ db->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+ &files_at_level));
+ if (files_at_level != "0") {
+ bottommost_level = level;
+ }
+ }
+
+ if (db_options_->bottommost_compression != kDisableCompressionOption &&
+ ci.output_level == bottommost_level) {
+ ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+ } else if (db_options_->compression_per_level.size() != 0) {
+ ASSERT_EQ(ci.compression,
+ db_options_->compression_per_level[ci.output_level]);
+ } else {
+ ASSERT_EQ(ci.compression, db_options_->compression);
+ }
+ max_level_checked = std::max(max_level_checked, ci.output_level);
+ }
+
+ int max_level_checked = 0;
+ const Options* db_options_;
+};
+
+TEST_F(DBTest2, CompressionOptions) {
+ if (!Zlib_Supported() || !Snappy_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_bytes_for_level_base = 100;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 7;
+ options.max_background_compactions = 1;
+
+ CompactionCompressionListener* listener =
+ new CompactionCompressionListener(&options);
+ options.listeners.emplace_back(listener);
+
+ const int kKeySize = 5;
+ const int kValSize = 20;
+ Random rnd(301);
+
+ for (int iter = 0; iter <= 2; iter++) {
+ listener->max_level_checked = 0;
+
+ if (iter == 0) {
+ // Use different compression algorithms for different levels but
+ // always use Zlib for bottommost level
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kNoCompression, kSnappyCompression,
+ kSnappyCompression, kSnappyCompression,
+ kZlibCompression};
+ options.compression = kNoCompression;
+ options.bottommost_compression = kZlibCompression;
+ } else if (iter == 1) {
+ // Use Snappy except for bottommost level use ZLib
+ options.compression_per_level = {};
+ options.compression = kSnappyCompression;
+ options.bottommost_compression = kZlibCompression;
+ } else if (iter == 2) {
+ // Use Snappy everywhere
+ options.compression_per_level = {};
+ options.compression = kSnappyCompression;
+ options.bottommost_compression = kDisableCompressionOption;
+ }
+
+ DestroyAndReopen(options);
+ // Write 10 random files
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 5; j++) {
+ ASSERT_OK(
+ Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValSize)));
+ }
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ // Make sure that we wrote enough to check all 7 levels
+ ASSERT_EQ(listener->max_level_checked, 6);
+ }
+}
+
+class CompactionStallTestListener : public EventListener {
+ public:
+ CompactionStallTestListener() : compacting_files_cnt_(0), compacted_files_cnt_(0) {}
+
+ void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+ compacting_files_cnt_ += ci.input_files.size();
+ }
+
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+ compacted_files_cnt_ += ci.input_files.size();
+ }
+
+ std::atomic<size_t> compacting_files_cnt_;
+ std::atomic<size_t> compacted_files_cnt_;
+};
+
+TEST_F(DBTest2, CompactionStall) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
+ {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
+ {"DBTest2::CompactionStall:2",
+ "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
+ {"DBTest2::CompactionStall:3",
+ "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ options.max_background_compactions = 40;
+ CompactionStallTestListener* listener = new CompactionStallTestListener();
+ options.listeners.emplace_back(listener);
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ Random rnd(301);
+
+ // 4 Files in L0
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for compaction to be triggered
+ TEST_SYNC_POINT("DBTest2::CompactionStall:0");
+
+ // Clear "DBImpl::BGWorkCompaction" SYNC_POINT since we want to hold it again
+ // at DBTest2::CompactionStall:1
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Another 6 L0 files to trigger compaction again
+ for (int i = 0; i < 6; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for another compaction to be triggered
+ TEST_SYNC_POINT("DBTest2::CompactionStall:1");
+
+ // Hold NotifyOnCompactionBegin in the unlock mutex section
+ TEST_SYNC_POINT("DBTest2::CompactionStall:2");
+
+ // Hold NotifyOnCompactionCompleted in the unlock mutex section
+ TEST_SYNC_POINT("DBTest2::CompactionStall:3");
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_LT(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+ ASSERT_GT(listener->compacted_files_cnt_.load(),
+ 10 - options.level0_file_num_compaction_trigger);
+ ASSERT_EQ(listener->compacting_files_cnt_.load(), listener->compacted_files_cnt_.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, FirstSnapshotTest) {
+ Options options;
+ options.write_buffer_size = 100000; // Small write buffer
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // This snapshot will have sequence number 0, which is the expected behaviour.
+ const Snapshot* s1 = db_->GetSnapshot();
+
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ Put(1, "k2", std::string(100000, 'y')); // Trigger flush
+
+ db_->ReleaseSnapshot(s1);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, DuplicateSnapshot) {
+ Options options;
+ options = CurrentOptions(options);
+ std::vector<const Snapshot*> snapshots;
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ SequenceNumber oldest_ww_snap, first_ww_snap;
+
+ Put("k", "v"); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+ snapshots.push_back(db_->GetSnapshot());
+ Put("k", "v"); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+ snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+ first_ww_snap = snapshots.back()->GetSequenceNumber();
+ Put("k", "v"); // inc seq
+ snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+ snapshots.push_back(db_->GetSnapshot());
+ Put("k", "v"); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+
+ {
+ InstrumentedMutexLock l(dbi->mutex());
+ auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
+ ASSERT_EQ(seqs.size(), 4); // duplicates are not counted
+ ASSERT_EQ(oldest_ww_snap, first_ww_snap);
+ }
+
+ for (auto s : snapshots) {
+ db_->ReleaseSnapshot(s);
+ }
+}
+#endif // ROCKSDB_LITE
+
+class PinL0IndexAndFilterBlocksTest
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ PinL0IndexAndFilterBlocksTest() : DBTestBase("/db_pin_l0_index_bloom_test") {}
+ void SetUp() override {
+ infinite_max_files_ = std::get<0>(GetParam());
+ disallow_preload_ = std::get<1>(GetParam());
+ }
+
+ void CreateTwoLevels(Options* options, bool close_afterwards) {
+ if (infinite_max_files_) {
+ options->max_open_files = -1;
+ }
+ options->create_if_missing = true;
+ options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options->table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, *options);
+
+ Put(1, "a", "begin");
+ Put(1, "z", "end");
+ ASSERT_OK(Flush(1));
+ // move this table to L1
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+ // reset block cache
+ table_options.block_cache = NewLRUCache(64 * 1024);
+ options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+ TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
+ // create new table at L0
+ Put(1, "a2", "begin2");
+ Put(1, "z2", "end2");
+ ASSERT_OK(Flush(1));
+
+ if (close_afterwards) {
+ Close(); // This ensures that there is no ref to block cache entries
+ }
+ table_options.block_cache->EraseUnRefEntries();
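+ // Erasing unreferenced entries leaves only blocks that are still pinned
+ // (e.g. the L0 index/filter when pinning is in effect) in the block cache.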
+ }
+
+ bool infinite_max_files_;
+ bool disallow_preload_;
+};
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+ IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
+ Options options = CurrentOptions();
+ if (infinite_max_files_) {
+ options.max_open_files = -1;
+ }
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // only index/filter were added
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+ std::string value;
+ // Miss and hit count should remain the same, they're all pinned.
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // Miss and hit count should remain the same, they're all pinned.
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+ MultiLevelIndexAndFilterBlocksCachedWithPinning) {
+ Options options = CurrentOptions();
+ PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
+ // get base cache values
+ uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+ std::string value;
+ // this should be read from L0
+ // so cache values don't change
+ value = Get(1, "a2");
+ ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // this should be read from L1
+ // the file is opened, prefetching results in a cache filter miss
+ // the block is loaded and added to the cache,
+ // then the get results in a cache hit for L1
+ // When we have infinite max_open_files, there is still a cache miss because
+ // we have reset the block cache
+ value = Get(1, "a");
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
+ Options options = CurrentOptions();
+ // This ensures that db does not ref anything in the block cache, so
+ // EraseUnRefEntries could clear them up.
+ bool close_afterwards = true;
+ PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);
+
+ // Get base cache values
+ uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+ if (disallow_preload_) {
+ // Now we have two files. We narrow the max open files to allow 3 entries
+ // so that preloading SST files won't happen.
+ options.max_open_files = 13;
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 13;
+ });
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Reopen database. If max_open_files is set as -1, table readers will be
+ // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
+ // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ if (!disallow_preload_) {
+ // After reopen, cache miss are increased by one because we read (and only
+ // read) filter and index on L0
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ // If max_open_files is not -1, we do not preload table readers, so there is
+ // no change.
+ ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+ std::string value;
+ // this should be read from L0
+ value = Get(1, "a2");
+ // If max_open_files is -1, we have pinned index and filter in Rep, so there
+ // will not be changes in index and filter misses or hits. If max_open_files
+ // is not -1, Get() will open a TableReader and prefetch index and filter.
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // this should be read from L1
+ value = Get(1, "a");
+ if (!disallow_preload_) {
+ // In the infinite max_open_files case, there's a cache miss when executing
+ // Get() because index and filter were not prefetched before.
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ // In this case, cache miss will be increased by one in
+ // BlockBasedTable::Open() because this is not in DB::Open() code path so we
+ // will prefetch L1's index and filter. Cache hit will also be increased by
+ // one because Get() will read index and filter from the block cache
+ // prefetched in previous Open() call.
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+
+ // Force a full compaction to a single file. There will be a block cache
+ // read for both the index and the filter. If prefetching doesn't explicitly
+ // happen, it will happen when verifying the file.
+ Compact(1, "a", "zzzzz");
+ dbfull()->TEST_WaitForCompact();
+
+ if (!disallow_preload_) {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+
+ // Bloom and index hit will happen when a Get() happens.
+ value = Get(1, "a");
+ if (!disallow_preload_) {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest,
+ PinL0IndexAndFilterBlocksTest,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(false, true)));
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, MaxCompactionBytesTest) {
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 100 << 10;
+ // Infinite for full compaction.
+ options.max_compaction_bytes = options.target_file_size_base * 100;
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 8; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,8", FilesPerLevel(0));
+
+ // When compacting from Ln -> Ln+1, cut the output file if it overlaps with
+ // more than three files in Ln+1.
+ options.max_compaction_bytes = options.target_file_size_base * 3;
+ Reopen(options);
+
+ GenerateNewRandomFile(&rnd);
+ // Add three more small files that overlap with the previous file
+ for (int i = 0; i < 3; i++) {
+ Put("a", "z");
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // Output files to L1 are cut into three pieces, according to
+ // options.max_compaction_bytes
+ ASSERT_EQ("0,3,8", FilesPerLevel(0));
+}
+
+static void UniqueIdCallback(void* arg) {
+ int* result = reinterpret_cast<int*>(arg);
+ if (*result == -1) {
+ *result = 0;
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+
+class MockPersistentCache : public PersistentCache {
+ public:
+ explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
+ : is_compressed_(is_compressed), max_size_(max_size) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+ }
+
+ ~MockPersistentCache() override {}
+
+ PersistentCache::StatsType Stats() override {
+ return PersistentCache::StatsType();
+ }
+
+ Status Insert(const Slice& page_key, const char* data,
+ const size_t size) override {
+ MutexLock _(&lock_);
+
+ if (size_ > max_size_) {
+ size_ -= data_.begin()->second.size();
+ data_.erase(data_.begin());
+ }
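+ // Simple eviction: when over capacity, drop one entry, the one with the
+ // smallest key in std::map iteration order, before inserting the new one.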
+
+ data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
+ size_ += size;
+ return Status::OK();
+ }
+
+ Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+ size_t* size) override {
+ MutexLock _(&lock_);
+ auto it = data_.find(page_key.ToString());
+ if (it == data_.end()) {
+ return Status::NotFound();
+ }
+
+ assert(page_key.ToString() == it->first);
+ data->reset(new char[it->second.size()]);
+ memcpy(data->get(), it->second.c_str(), it->second.size());
+ *size = it->second.size();
+ return Status::OK();
+ }
+
+ bool IsCompressed() override { return is_compressed_; }
+
+ std::string GetPrintableOptions() const override {
+ return "MockPersistentCache";
+ }
+
+ port::Mutex lock_;
+ std::map<std::string, std::string> data_;
+ const bool is_compressed_ = true;
+ size_t size_ = 0;
+ const size_t max_size_ = 10 * 1024; // 10KiB
+};
+
+#ifdef OS_LINUX
+ // Make sure that in CPU time perf context counters, Env::NowCPUNanos()
+ // is used, rather than Env::NowNanos().
+TEST_F(DBTest2, TestPerfContextGetCpuTime) {
+ // force resizing table cache so table handle is not preloaded so that
+ // we can measure find_table_nanos during Get().
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ env_->now_cpu_count_.store(0);
+
+ // CPU timing is not enabled with kEnableTimeExceptForMutex
+ SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
+ ASSERT_EQ(0, env_->now_cpu_count_.load());
+
+ uint64_t kDummyAddonTime = uint64_t{1000000000000};
+
+ // Add time to NowNanos() reading.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0",
+ [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
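+ // Only the wall-clock NowNanos() reading is inflated by the callback, so
+ // get_cpu_nanos (measured with NowCPUNanos) must stay below kDummyAddonTime
+ // while the wall-clock find_table_nanos must exceed it.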
+
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_GT(env_->now_cpu_count_.load(), 2);
+ ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonTime);
+ ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime);
+
+ SetPerfLevel(PerfLevel::kDisable);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestPerfContextIterCpuTime) {
+ DestroyAndReopen(CurrentOptions());
+ // force resizing table cache so table handle is not preloaded so that
+ // we can measure find_table_nanos during iteration
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+
+ const size_t kNumEntries = 10;
+ for (size_t i = 0; i < kNumEntries; ++i) {
+ ASSERT_OK(Put("k" + ToString(i), "v" + ToString(i)));
+ }
+ ASSERT_OK(Flush());
+ for (size_t i = 0; i < kNumEntries; ++i) {
+ ASSERT_EQ("v" + ToString(i), Get("k" + ToString(i)));
+ }
+ std::string last_key = "k" + ToString(kNumEntries - 1);
+ std::string last_value = "v" + ToString(kNumEntries - 1);
+ env_->now_cpu_count_.store(0);
+
+ // CPU timing is not enabled with kEnableTimeExceptForMutex
+ SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos);
+ ASSERT_EQ(0, env_->now_cpu_count_.load());
+ delete iter;
+
+ uint64_t kDummyAddonTime = uint64_t{1000000000000};
+
+ // Add time to NowNanos() reading.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0",
+ [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonTime);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonTime);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonTime);
+ ASSERT_GE(env_->now_cpu_count_.load(), 12);
+ ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime);
+
+ SetPerfLevel(PerfLevel::kDisable);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ delete iter;
+}
+#endif // OS_LINUX
+
+// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache
+// breaks when that function is not implemented and no regular block cache is
+// provided.
+#if !defined(OS_SOLARIS) && !defined(OS_WIN)
+TEST_F(DBTest2, PersistentCache) {
+ int num_iter = 80;
+
+ Options options;
+ options.write_buffer_size = 64 * 1024; // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options = CurrentOptions(options);
+
+ auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024};
+ auto types = {/*compressed*/ 1, /*uncompressed*/ 0};
+ for (auto bsize : bsizes) {
+ for (auto type : types) {
+ BlockBasedTableOptions table_options;
+ table_options.persistent_cache.reset(
+ new MockPersistentCache(type, 10 * 1024));
+ table_options.no_block_cache = true;
+ table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+ // Write 80 values, each 1000 bytes (a fresh random string every 4 keys)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = RandomString(&rnd, 1000);
+ }
+ values.push_back(str);
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // flush all data from memtable so that reads are from block cache
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+
+ auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
+ auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);
+
+ ASSERT_GT(hit, 0);
+ ASSERT_GT(miss, 0);
+ }
+ }
+}
+#endif // !defined(OS_SOLARIS) && !defined(OS_WIN)
+
+namespace {
+void CountSyncPoint() {
+ TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
+}
+} // namespace
+
+TEST_F(DBTest2, SyncPointMarker) {
+ std::atomic<int> sync_point_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTest2::MarkedPoint",
+ [&](void* /*arg*/) { sync_point_called.fetch_add(1); });
+
+ // The first dependency enforces Marker can be loaded before MarkedPoint.
+ // The second checks that thread 1's MarkedPoint should be disabled here.
+ // Execution order:
+ // | Thread 1 | Thread 2 |
+ // | | Marker |
+ // | MarkedPoint | |
+ // | Thread1First | |
+ // | | MarkedPoint |
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
+ {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
+ {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::function<void()> func1 = [&]() {
+ CountSyncPoint();
+ TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
+ };
+
+ std::function<void()> func2 = [&]() {
+ TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
+ CountSyncPoint();
+ };
+
+ auto thread1 = port::Thread(func1);
+ auto thread2 = port::Thread(func2);
+ thread1.join();
+ thread2.join();
+
+ // Callback is only executed once
+ ASSERT_EQ(sync_point_called.load(), 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
+size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
+ std::string buffer;
+
+ PutVarint32(&buffer, static_cast<uint32_t>(0));
+ PutVarint32(&buffer, static_cast<uint32_t>(key_size));
+ PutVarint32(&buffer, static_cast<uint32_t>(value_size));
+
+ return buffer.size() + key_size + value_size;
+}
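+ // Note: this mirrors the block entry encoding with delta encoding disabled:
+ // three varint32 headers (shared key length, fixed at 0 here, non-shared
+ // key length, and value length) followed by the key and value bytes.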
+
+TEST_F(DBTest2, ReadAmpBitmap) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions bbto;
+ uint32_t bytes_per_bit[2] = {1, 16};
+ for (size_t k = 0; k < 2; k++) {
+ // Disable delta encoding to make it easier to calculate read amplification
+ bbto.use_delta_encoding = false;
+ // Huge block cache to make it easier to calculate read amplification
+ bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
+ bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
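+ // With 1 byte per bit the bitmap tracks reads at byte granularity, so the
+ // estimate can match exactly; with 16 bytes per bit it is coarser, which is
+ // why the final check below only requires the ratio to be within 1% of 1.0.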
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ const size_t kNumEntries = 10000;
+
+ Random rnd(301);
+ for (size_t i = 0; i < kNumEntries; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(i)), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ Close();
+ Reopen(options);
+
+ // Read keys/values randomly and verify that reported read amp error
+ // is less than 2%
+ uint64_t total_useful_bytes = 0;
+ std::set<int> read_keys;
+ std::string value;
+ for (size_t i = 0; i < kNumEntries * 5; i++) {
+ int key_idx = rnd.Next() % kNumEntries;
+ std::string key = Key(key_idx);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(key_idx) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ total_useful_bytes +=
+ GetEncodedEntrySize(internal_key.size(), value.size());
+ read_keys.insert(key_idx);
+ }
+
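+    // expected_read_amp is computed from the entries this test actually
+    // requested; read_amp is the bitmap-based estimate reported by the
+    // statistics. The two should agree to within 2%.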
+ double expected_read_amp =
+ static_cast<double>(total_useful_bytes) /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double read_amp =
+ static_cast<double>(options.statistics->getTickerCount(
+ READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double error_pct = fabs(expected_read_amp - read_amp) * 100;
+ // Error between reported read amp and real read amp should be less than
+ // 2%
+ EXPECT_LE(error_pct, 2);
+ }
+
+  // Make sure we read everything in the DB (which is smaller than our cache)
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
+ }
+ delete iter;
+
+  // Read amp is on average 100% since we read everything we loaded into memory
+ if (k == 0) {
+ ASSERT_EQ(
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
+ } else {
+ ASSERT_NEAR(
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
+ 1.0f /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
+ 1, .01);
+ }
+ }
+}
+
+#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented
+TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
+ {
+ const int kIdBufLen = 100;
+ char id_buf[kIdBufLen];
+#ifndef OS_WIN
+    // You can't open a directory on Windows using a random access file
+ std::unique_ptr<RandomAccessFile> file;
+ ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions()));
+ if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
+      // The fs holding the db directory doesn't support getting a unique file
+      // id. Running this test would then fail because lru_cache would load the
+      // blocks again even though they are already in the cache.
+ return;
+ }
+#else
+ std::unique_ptr<Directory> dir;
+ ASSERT_OK(env_->NewDirectory(dbname_, &dir));
+ if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
+      // The fs holding the db directory doesn't support getting a unique file
+      // id. Running this test would then fail because lru_cache would load the
+      // blocks again even though they are already in the cache.
+ return;
+ }
+#endif
+ }
+ uint32_t bytes_per_bit[2] = {1, 16};
+ for (size_t k = 0; k < 2; k++) {
+ std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ Options options = CurrentOptions();
+ BlockBasedTableOptions bbto;
+ // Disable delta encoding to make it easier to calculate read amplification
+ bbto.use_delta_encoding = false;
+ // Huge block cache to make it easier to calculate read amplification
+ bbto.block_cache = lru_cache;
+ bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.statistics = stats;
+ DestroyAndReopen(options);
+
+ const int kNumEntries = 10000;
+
+ Random rnd(301);
+ for (int i = 0; i < kNumEntries; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ Close();
+ Reopen(options);
+
+ uint64_t total_useful_bytes = 0;
+ std::set<int> read_keys;
+ std::string value;
+ // Iter1: Read half the DB, Read even keys
+ // Key(0), Key(2), Key(4), Key(6), Key(8), ...
+ for (int i = 0; i < kNumEntries; i += 2) {
+ std::string key = Key(i);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(i) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ total_useful_bytes +=
+ GetEncodedEntrySize(internal_key.size(), value.size());
+ read_keys.insert(i);
+ }
+ }
+
+ size_t total_useful_bytes_iter1 =
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ size_t total_loaded_bytes_iter1 =
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ Close();
+ std::shared_ptr<Statistics> new_statistics =
+ ROCKSDB_NAMESPACE::CreateDBStatistics();
+ // Destroy old statistics obj that the blocks in lru_cache are pointing to
+ options.statistics.reset();
+ // Use the statistics object that we just created
+ options.statistics = new_statistics;
+ Reopen(options);
+
+ // Iter2: Read half the DB, Read odd keys
+ // Key(1), Key(3), Key(5), Key(7), Key(9), ...
+ for (int i = 1; i < kNumEntries; i += 2) {
+ std::string key = Key(i);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(i) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ total_useful_bytes +=
+ GetEncodedEntrySize(internal_key.size(), value.size());
+ read_keys.insert(i);
+ }
+ }
+
+ size_t total_useful_bytes_iter2 =
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ size_t total_loaded_bytes_iter2 =
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+
+    // Read amp is on average 100% since we read everything we loaded into
+    // memory
+ if (k == 0) {
+ ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
+ total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
+ } else {
+ ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f /
+ (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
+ 1, .01);
+ }
+ }
+}
+#endif // !OS_SOLARIS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.IncreaseParallelism(20);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ auto get_stat = [](std::string level_str, LevelStatType type,
+ std::map<std::string, std::string> props) {
+ auto prop_str =
+ "compaction." + level_str + "." +
+ InternalStats::compaction_level_stats.at(type).property_name.c_str();
+ auto prop_item = props.find(prop_str);
+ return prop_item == props.end() ? 0 : std::stod(prop_item->second);
+ };
+
+ // Trivial move 2 files to L2
+ ASSERT_EQ("0,0,2", FilesPerLevel());
+  // Also test that the stats GetMapProperty API reports the same result
+ {
+ std::map<std::string, std::string> prop;
+ ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
+ }
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L2. These 2 files would be moved to L2, overlapping with
+  // the running compaction and breaking the LSM consistency.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+ {"max_bytes_for_level_base", "1"}}));
+ ASSERT_OK(Put(Key(6), "a"));
+ ASSERT_OK(Put(Key(7), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(8), "a"));
+ ASSERT_OK(Put(Key(9), "a"));
+ ASSERT_OK(Flush());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Run a manual compaction that will compact the 2 files in L2
+ // into 1 file in L2
+ cro.exclusive_manual_compaction = false;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
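+  // Forcing bottommost-level compaction ensures the two L2 files are actually
+  // rewritten into one instead of being skipped as already compacted.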
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Test that the stats GetMapProperty API reports 1 file in L2
+ {
+ std::map<std::string, std::string> prop;
+ ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
+ }
+}
+
+TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ options.IncreaseParallelism(20);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Trivial move 2 files to L1
+ ASSERT_EQ("0,2", FilesPerLevel());
+
+ std::function<void()> bg_manual_compact = [&]() {
+ std::string k1 = Key(6);
+ std::string k2 = Key(9);
+ Slice k1s(k1);
+ Slice k2s(k2);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
+ };
+ ROCKSDB_NAMESPACE::port::Thread bg_thread;
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L1. These 2 files would be moved to L1, overlapping with
+  // the running compaction and breaking the LSM consistency.
+ std::atomic<bool> flag(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ if (flag.exchange(true)) {
+ // We want to make sure to call this callback only once
+ return;
+ }
+ ASSERT_OK(Put(Key(6), "a"));
+ ASSERT_OK(Put(Key(7), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(8), "a"));
+ ASSERT_OK(Put(Key(9), "a"));
+ ASSERT_OK(Flush());
+
+ // Start a non-exclusive manual compaction in a bg thread
+ bg_thread = port::Thread(bg_manual_compact);
+        // This manual compaction conflicts with the other manual compaction,
+        // so it should wait until the first compaction finishes
+ env_->SleepForMicroseconds(1000000);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Run a manual compaction that will compact the 2 files in L1
+ // into 1 file in L1
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ bg_thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction1) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+
+ // Generate another file containing same keys
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+
+ int manual_compactions_paused = 0;
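+  // The sync point passes in the compaction's internal "paused" flag; setting
+  // it here simulates DisableManualCompaction() arriving mid-run, so the
+  // manual compaction below is expected to stop without writing any new files.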
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
+ auto paused = reinterpret_cast<std::atomic<bool>*>(arg);
+ ASSERT_FALSE(paused->load(std::memory_order_acquire));
+ paused->store(true, std::memory_order_release);
+ manual_compactions_paused += 1;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<std::string> files_before_compact, files_after_compact;
+ // Remember file name before compaction is triggered
+ std::vector<LiveFileMetaData> files_meta;
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_before_compact.push_back(file.name);
+ }
+
+ // OK, now trigger a manual compaction
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ // Wait for compactions to get scheduled and stopped
+ dbfull()->TEST_WaitForCompact(true);
+
+ // Get file names after compaction is stopped
+ files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_after_compact.push_back(file.name);
+ }
+
+ // Like nothing happened
+ ASSERT_EQ(files_before_compact, files_after_compact);
+ ASSERT_EQ(manual_compactions_paused, 1);
+
+ manual_compactions_paused = 0;
+  // Now make sure CompactFiles also does not run
+ dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ files_before_compact, 0);
+ // Wait for manual compaction to get scheduled and finish
+ dbfull()->TEST_WaitForCompact(true);
+
+ files_meta.clear();
+ files_after_compact.clear();
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_after_compact.push_back(file.name);
+ }
+
+ ASSERT_EQ(files_before_compact, files_after_compact);
+  // CompactFiles returns right at its entry point, so the pausing sync point
+  // is never reached
+ ASSERT_EQ(manual_compactions_paused, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// PausingManualCompaction does not affect auto compaction
+TEST_F(DBTest2, PausingManualCompaction2) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = false;
+
+ DestroyAndReopen(options);
+ dbfull()->DisableManualCompaction();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; i++) {
+    // Generate a file containing 100 keys.
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 50)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ std::vector<LiveFileMetaData> files_meta;
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ ASSERT_EQ(files_meta.size(), 1);
+}
+
+TEST_F(DBTest2, PausingManualCompaction3) {
+ CompactRangeOptions compact_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
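+  // Build an LSM with files on every level: iteration i leaves
+  // (num_levels - i + 1) files, which are then moved down to level
+  // (num_levels - 1 - i), producing the "2,3,4,5,6,7,8" shape asserted below.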
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50)));
+ }
+ Flush();
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ int run_manual_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ [&](void* /*arg*/) { run_manual_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->DisableManualCompaction();
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ dbfull()->TEST_WaitForCompact(true);
+  // As manual compaction is disabled, the sync point is never even reached
+ ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1");
+ dbfull()->EnableManualCompaction();
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction4) {
+ CompactRangeOptions compact_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50)));
+ }
+ Flush();
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ int run_manual_compactions = 0;
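+  // Same idea as PausingManualCompaction1, but using the later ":2" sync point
+  // inside CompactionJob::Run(); setting the pause flag there should likewise
+  // stop the manual compaction before it changes the file layout.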
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
+ auto paused = reinterpret_cast<std::atomic<bool>*>(arg);
+ ASSERT_FALSE(paused->load(std::memory_order_acquire));
+ paused->store(true, std::memory_order_release);
+ run_manual_compactions++;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->EnableManualCompaction();
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ dbfull()->TEST_WaitForCompact(true);
+ ASSERT_EQ(run_manual_compactions, 1);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2");
+ dbfull()->EnableManualCompaction();
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+ dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, OptimizeForPointLookup) {
+ Options options = CurrentOptions();
+ Close();
+ options.OptimizeForPointLookup(2);
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ Flush();
+ ASSERT_EQ("v1", Get("foo"));
+}
+
+TEST_F(DBTest2, OptimizeForSmallDB) {
+ Options options = CurrentOptions();
+ Close();
+ options.OptimizeForSmallDb();
+
+ // Find the cache object
+ ASSERT_EQ(std::string(BlockBasedTableFactory::kName),
+ std::string(options.table_factory->Name()));
+ BlockBasedTableOptions* table_options =
+ reinterpret_cast<BlockBasedTableOptions*>(
+ options.table_factory->GetOptions());
+ ASSERT_TRUE(table_options != nullptr);
+ std::shared_ptr<Cache> cache = table_options->block_cache;
+
+ ASSERT_EQ(0, cache->GetUsage());
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ ASSERT_OK(Put("foo", "v1"));
+
+  // Memtable memory is charged to the block cache
+ ASSERT_NE(0, cache->GetUsage());
+
+ ASSERT_EQ("v1", Get("foo"));
+ Flush();
+
+ size_t prev_size = cache->GetUsage();
+  // Remember the block cache size so that we can verify it grows after Get().
+  // Use a pinnable slice so that the block stays pinned and is not evicted
+  // before we check the size.
+ PinnableSlice value;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
+ ASSERT_GT(cache->GetUsage(), prev_size);
+ value.Reset();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, GetRaceFlush1) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
+ {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ Flush();
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+ });
+
+ // Get() is issued after the first Put(), so it should see either
+ // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo"));
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush2) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
+ {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ Flush();
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+ });
+
+ // Get() is issued after the first Put(), so it should see either
+ // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo"));
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, DirectIO) {
+ if (!IsDirectIOSupported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+ true;
+ options.allow_mmap_reads = options.allow_mmap_writes = false;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Reopen(options);
+}
+
+TEST_F(DBTest2, MemtableOnlyIterator) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "first"));
+ ASSERT_OK(Put(1, "bar", "second"));
+
+ ReadOptions ropt;
+ ropt.read_tier = kMemtableTier;
+ std::string value;
+ Iterator* it = nullptr;
+
+ // Before flushing
+ // point lookups
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_EQ("first", value);
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ ASSERT_EQ("second", value);
+
+ // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet.
+ it = db_->NewIterator(ropt, handles_[1]);
+ int count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(2, count);
+ delete it;
+
+ Flush(1);
+
+ // After flushing
+ // point lookups
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_EQ("first", value);
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ ASSERT_EQ("second", value);
+  // Nothing should be returned by the memtable-only iterator after flushing.
+ it = db_->NewIterator(ropt, handles_[1]);
+ count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(0, count);
+ delete it;
+
+ // Add a key to memtable
+ ASSERT_OK(Put(1, "foobar", "third"));
+ it = db_->NewIterator(ropt, handles_[1]);
+ count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foobar", it->key().ToString());
+ ASSERT_EQ("third", it->value().ToString());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(1, count);
+ delete it;
+}
+
+TEST_F(DBTest2, LowPriWrite) {
+ Options options = CurrentOptions();
+  // Compaction pressure should build up once the 6 L0 files written below
+  // exceed the compaction trigger of 4
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 12;
+ options.level0_stop_writes_trigger = 30;
+ options.delayed_write_rate = 8 * 1024 * 1024;
+ Reopen(options);
+
+ std::atomic<int> rate_limit_count(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GenericRateLimiter::Request:1", [&](void* arg) {
+ rate_limit_count.fetch_add(1);
+ int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
+ ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
+ });
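+  // Only low-pri writes issued while compaction is pending should be throttled
+  // through the rate limiter; the counter above tracks those requests.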
+ // Block compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wo;
+ for (int i = 0; i < 6; i++) {
+ wo.low_pri = false;
+ Put("", "", wo);
+ wo.low_pri = true;
+ Put("", "", wo);
+ Flush();
+ }
+ ASSERT_EQ(0, rate_limit_count.load());
+ wo.low_pri = true;
+ Put("", "", wo);
+ ASSERT_EQ(1, rate_limit_count.load());
+ wo.low_pri = false;
+ Put("", "", wo);
+ ASSERT_EQ(1, rate_limit_count.load());
+
+ TEST_SYNC_POINT("DBTest.LowPriWrite:0");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ dbfull()->TEST_WaitForCompact();
+ wo.low_pri = true;
+ Put("", "", wo);
+ ASSERT_EQ(1, rate_limit_count.load());
+ wo.low_pri = false;
+ Put("", "", wo);
+ ASSERT_EQ(1, rate_limit_count.load());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RateLimitedCompactionReads) {
+ // compaction input has 512KB data
+ const int kNumKeysPerFile = 128;
+ const int kBytesPerKey = 1024;
+ const int kNumL0Files = 4;
+
+ for (auto use_direct_io : {false, true}) {
+ if (use_direct_io && !IsDirectIOSupported()) {
+ continue;
+ }
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ options.new_table_reader_for_compaction_inputs = true;
+    // At this rate, reading the compaction input takes roughly one second,
+    // split into 100 x 10ms refill intervals. Each interval permits 5.12KB,
+    // which is smaller than the block size, so this test exercises the code
+    // for chunking reads.
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
+ kBytesPerKey) /* rate_bytes_per_sec */,
+ 10 * 1000 /* refill_period_us */, 10 /* fairness */,
+ RateLimiter::Mode::kReadsOnly));
+ options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+ use_direct_io;
+ BlockBasedTableOptions bbto;
+ bbto.block_size = 16384;
+ bbto.no_block_cache = true;
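+    // Disable the block cache so repeated block reads are not served from
+    // memory and the byte counts below reflect actual file reads.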
+ options.table_factory.reset(new BlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j <= kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ ASSERT_EQ(0, options.rate_limiter->GetTotalBytesThrough(Env::IO_HIGH));
+ // should be slightly above 512KB due to non-data blocks read. Arbitrarily
+ // chose 1MB as the upper bound on the total bytes read.
+ size_t rate_limited_bytes =
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW);
+ // Include the explicit prefetch of the footer in direct I/O case.
+ size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
+ ASSERT_GE(
+ rate_limited_bytes,
+ static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
+ ASSERT_LT(
+ rate_limited_bytes,
+ static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
+ direct_io_extra));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
+ }
+ delete iter;
+ // bytes read for user iterator shouldn't count against the rate limit.
+ ASSERT_EQ(rate_limited_bytes,
+ static_cast<size_t>(
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW)));
+ }
+}
+#endif // ROCKSDB_LITE
+
+// Make sure the DB can be reopened with a reduced number of levels, given
+// that no file is on a level higher than the new num_levels.
+TEST_F(DBTest2, ReduceLevel) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+ Reopen(options);
+ Put("foo", "bar");
+ Flush();
+ MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ dbfull()->CompactRange(compact_options, nullptr, nullptr);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ options.num_levels = 3;
+ Reopen(options);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+}
+
+// Test that ReadCallback is actually used in both the memtable and SST files
+TEST_F(DBTest2, ReadCallbackTest) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+ Reopen(options);
+ std::vector<const Snapshot*> snapshots;
+ // Try to create a db with multiple layers and a memtable
+ const std::string key = "foo";
+ const std::string value = "bar";
+  // This test assumes that the seq starts at 1 and is increased by 1 after
+  // each write batch of size 1. If that behavior changes, the test needs to be
+ // updated as well.
+ // TODO(myabandeh): update this test to use the seq number that is returned by
+ // the DB instead of assuming what seq the DB used.
+ int i = 1;
+ for (; i < 10; i++) {
+ Put(key, value + std::to_string(i));
+ // Take a snapshot to avoid the value being removed during compaction
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ Flush();
+ for (; i < 20; i++) {
+ Put(key, value + std::to_string(i));
+ // Take a snapshot to avoid the value being removed during compaction
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ Flush();
+ MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ for (; i < 30; i++) {
+ Put(key, value + std::to_string(i));
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ Flush();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ // And also add some values to the memtable
+ for (; i < 40; i++) {
+ Put(key, value + std::to_string(i));
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+
+ class TestReadCallback : public ReadCallback {
+ public:
+ explicit TestReadCallback(SequenceNumber snapshot)
+ : ReadCallback(snapshot), snapshot_(snapshot) {}
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= snapshot_;
+ }
+
+ private:
+ SequenceNumber snapshot_;
+ };
+
+ for (int seq = 1; seq < i; seq++) {
+ PinnableSlice pinnable_val;
+ ReadOptions roptions;
+ TestReadCallback callback(seq);
+ bool dont_care = true;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = dbfull()->DefaultColumnFamily();
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = &dont_care;
+ get_impl_options.callback = &callback;
+ Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
+ ASSERT_TRUE(s.ok());
+    // Assuming the DB increases the seq by one after each Put, the value
+    // suffix and the seq number must be equal since we also increment the
+    // value suffix by 1 after each Put.
+ ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
+ }
+
+ for (auto snapshot : snapshots) {
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
+ // Regression test for race condition where an obsolete file is returned to
+ // user as a "live file" but then deleted, all while file deletions are
+ // disabled.
+ //
+ // It happened like this:
+ //
+ // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
+ // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and the
+ // latter returned "x.log"
+ // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
+ // 4. [user thread] Reading "x.log" failed
+ //
+ // Unfortunately the only regression test I can come up with involves sleep.
+ // We cannot set SyncPoints to repro since, once the fix is applied, the
+ // SyncPoints would cause a deadlock as the repro's sequence of events is now
+ // prohibited.
+ //
+ // Instead, if we sleep for a second between Find and Purge, and ensure the
+ // read attempt happens after purge, then the sequence of events will almost
+ // certainly happen on the old code.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BackgroundCallFlush:FilesFound",
+ "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
+ {"DBImpl::PurgeObsoleteFiles:End",
+ "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::PurgeObsoleteFiles:Begin",
+ [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put("key", "val");
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ db_->Flush(flush_opts);
+ TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
+
+ db_->DisableFileDeletions();
+ VectorLogPtr log_files;
+ db_->GetSortedWalFiles(log_files);
+ TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
+ for (const auto& log_file : log_files) {
+ ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
+ }
+
+ db_->EnableFileDeletions();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestNumPread) {
+ Options options = CurrentOptions();
+ // disable block cache
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ env_->count_random_reads_ = true;
+
+ env_->random_file_open_counter_.store(0);
+ ASSERT_OK(Put("bar", "foo"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ // After flush, we'll open the file and read footer, meta block,
+ // property block and index block.
+ ASSERT_EQ(4, env_->random_read_counter_.Read());
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ // All files are already opened.
+ ASSERT_EQ(0, env_->random_file_open_counter_.load());
+
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_OK(Put("bar2", "foo2"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Flush());
+ // After flush, we'll open the file and read footer, meta block,
+ // property block and index block.
+ ASSERT_EQ(4, env_->random_read_counter_.Read());
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+ // Compaction needs two input blocks, which requires 2 preads, and
+  // generates a new SST file, which needs 4 preads (footer, meta block,
+ // property block and index block). In total 6.
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(6, env_->random_read_counter_.Read());
+  // All compaction input files should have already been opened.
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_EQ("foo2", Get("bar2"));
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ // SST files are already opened.
+ ASSERT_EQ(0, env_->random_file_open_counter_.load());
+}
+
+TEST_F(DBTest2, TraceAndReplay) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f");
+ single_iter->SeekForPrev("g");
+ delete single_iter;
+
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ ASSERT_OK(db_->EndTrace());
+ // These should not get into the trace file as it is after EndTrace.
+ Put("hello", "world");
+ Merge("foo", "bar");
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::TmpDir(env_) + "/db_replay";
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ Replayer replayer(db2, handles_, std::move(trace_reader));
+ ASSERT_OK(replayer.Replay());
+
+ ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ("1", value);
+ ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+ ASSERT_EQ("12", value);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+ ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+ ASSERT_EQ("bar", value);
+ ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+ ASSERT_EQ("rocks", value);
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithLimit) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+
+ // test the max trace file size options
+ trace_opts.max_trace_file_size = 5;
+ std::string trace_filename = dbname_ + "/rocksdb.trace1";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Put(0, "b", "1"));
+ ASSERT_OK(Put(0, "c", "1"));
+ ASSERT_OK(db_->EndTrace());
+
+ std::string dbname2 = test::TmpDir(env_) + "/db_replay2";
+ std::string value;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ Replayer replayer(db2, handles_, std::move(trace_reader));
+ ASSERT_OK(replayer.Replay());
+
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithSampling) {
+ Options options = CurrentOptions();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+
+ // test the trace file sampling options
+ trace_opts.sampling_frequency = 2;
+ std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Put(0, "b", "2"));
+ ASSERT_OK(Put(0, "c", "3"));
+ ASSERT_OK(Put(0, "d", "4"));
+ ASSERT_OK(Put(0, "e", "5"));
+ ASSERT_OK(db_->EndTrace());
+
+ std::string dbname2 = test::TmpDir(env_) + "/db_replay_sampling";
+ std::string value;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ Replayer replayer(db2, handles_, std::move(trace_reader));
+ ASSERT_OK(replayer.Replay());
+
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+ ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithFilter) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ trace_opts.filter = TraceFilterType::kTraceFilterWrite;
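+  // kTraceFilterWrite excludes write operations from the trace, so replaying
+  // it later should not recreate any of the keys written below.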
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f");
+ single_iter->SeekForPrev("g");
+ delete single_iter;
+
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ ASSERT_OK(db_->EndTrace());
+ // These should not get into the trace file as it is after EndTrace.
+ Put("hello", "world");
+ Merge("foo", "bar");
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::TmpDir(env_) + "/db_replay";
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ Replayer replayer(db2, handles_, std::move(trace_reader));
+ ASSERT_OK(replayer.Replay());
+
+  // None of the keys should be present since we filtered out the WRITE ops.
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Set up a new db.
+ std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read";
+ ASSERT_OK(DestroyDB(dbname3, options));
+
+ DB* db3_init = nullptr;
+ options.create_if_missing = true;
+ ColumnFamilyHandle* cf3;
+ ASSERT_OK(DB::Open(options, dbname3, &db3_init));
+ ASSERT_OK(
+ db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
+ delete cf3;
+ delete db3_init;
+
+ column_families.clear();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ handles.clear();
+
+ DB* db3 = nullptr;
+ ASSERT_OK(DB::Open(DBOptions(), dbname3, column_families, &handles, &db3));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  // The tracer will not record the READ ops.
+ trace_opts.filter = TraceFilterType::kTraceFilterGet;
+ std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
+ std::unique_ptr<TraceWriter> trace_writer3;
+ ASSERT_OK(
+ NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
+ ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
+
+ ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
+ ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
+ ASSERT_OK(db3->Delete(wo, handles[0], "c"));
+ ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
+
+ ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ(value, "1");
+ ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ ASSERT_OK(db3->EndTrace());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db3;
+ ASSERT_OK(DestroyDB(dbname3, options));
+
+ std::unique_ptr<TraceReader> trace_reader3;
+ ASSERT_OK(
+ NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
+
+  // Count the number of records in the trace file.
+ int count = 0;
+ std::string data;
+ Status s;
+ while (true) {
+ s = trace_reader3->Read(&data);
+ if (!s.ok()) {
+ break;
+ }
+ count += 1;
+ }
+ // We also need to count the header and footer
+ // 4 WRITE + HEADER + FOOTER = 6
+ ASSERT_EQ(count, 6);
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, PinnableSliceAndMmapReads) {
+ Options options = CurrentOptions();
+ options.allow_mmap_reads = true;
+ options.max_open_files = 100;
+ options.compression = kNoCompression;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ PinnableSlice pinned_value;
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  // It is not safe to pin mmap files as they might disappear due to compaction
+ ASSERT_FALSE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+ dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
+ nullptr /* end */, nullptr /* column_family */,
+ true /* disallow_trivial_move */);
+
+ // Ensure pinned_value doesn't rely on memory munmap'd by the above
+ // compaction. It crashes if it does.
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+#ifndef ROCKSDB_LITE
+ pinned_value.Reset();
+ // Unsafe to pin mmap files when they could be kicked out of table cache
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ ASSERT_FALSE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+ pinned_value.Reset();
+  // In read-only mode with an unlimited table cache, it should pin the value
+  // and avoid the memcpy
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ ASSERT_TRUE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+#endif
+}
+
+TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = false;
+ bbto.cache_index_and_filter_blocks = false;
+ bbto.block_cache = NewLRUCache(100000);
+ bbto.block_size = 400; // small block size
+ options.table_factory.reset(new BlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ std::string v = RandomString(&rnd, 400);
+
+ // Since v is the size of a block, each key should take a block
+ // of 400+ bytes.
+ Put("1", v);
+ Put("3", v);
+ Put("5", v);
+ Put("7", v);
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+  // Verify that iterators don't pin more than one data block in the block
+  // cache at a time.
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->SeekToFirst();
+
+ for (int i = 0; i < 4; i++) {
+ ASSERT_TRUE(iter->Valid());
+ // Block cache should contain exactly one block.
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+
+ iter->Seek("4");
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+
+ iter->Seek("3");
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+ }
+ ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+ // Test compaction case
+ Put("2", v);
+ Put("5", v);
+ Put("6", v);
+ Put("8", v);
+ ASSERT_OK(Flush());
+
+ // Clear existing data in block cache
+ bbto.block_cache->SetCapacity(0);
+ bbto.block_cache->SetCapacity(100000);
+
+  // Verify that compaction input iterators don't hold more than one data
+  // block at a time.
+ std::atomic<bool> finished(false);
+ std::atomic<int> block_newed(0);
+ std::atomic<int> block_destroyed(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Block::Block:0", [&](void* /*arg*/) {
+ if (finished) {
+ return;
+ }
+ // Two iterators. At most 2 outstanding blocks.
+ EXPECT_GE(block_newed.load(), block_destroyed.load());
+ EXPECT_LE(block_newed.load(), block_destroyed.load() + 1);
+ block_newed.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Block::~Block", [&](void* /*arg*/) {
+ if (finished) {
+ return;
+ }
+ // Two iterators. At most 2 outstanding blocks.
+ EXPECT_GE(block_newed.load(), block_destroyed.load() + 1);
+ EXPECT_LE(block_newed.load(), block_destroyed.load() + 2);
+ block_destroyed.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run:BeforeVerify",
+ [&](void* /*arg*/) { finished = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Two input files. Each of them has 4 data blocks.
+ ASSERT_EQ(8, block_newed.load());
+ ASSERT_EQ(8, block_destroyed.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestBBTTailPrefetch) {
+ std::atomic<bool> called(false);
+ size_t expected_lower_bound = 512 * 1024;
+ size_t expected_higher_bound = 512 * 1024;
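+  // With no prefetch history, BlockBasedTable::Open is expected to prefetch a
+  // fixed 512KB tail; once a history of actual tail sizes exists, the prefetch
+  // length should shrink to at most 8KB here.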
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+ size_t* prefetch_size = static_cast<size_t*>(arg);
+ EXPECT_LE(expected_lower_bound, *prefetch_size);
+ EXPECT_GE(expected_higher_bound, *prefetch_size);
+ called = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ expected_lower_bound = 0;
+ expected_higher_bound = 8 * 1024;
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ // Full compaction to make sure there is no L0 file after the open.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_TRUE(called.load());
+ called = false;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::atomic<bool> first_call(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+ size_t* prefetch_size = static_cast<size_t*>(arg);
+ if (first_call) {
+ EXPECT_EQ(4 * 1024, *prefetch_size);
+ first_call = false;
+ } else {
+ EXPECT_GE(4 * 1024, *prefetch_size);
+ }
+ called = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.max_file_opening_threads = 1; // one thread
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.max_open_files = -1;
+ Reopen(options);
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ Put("1", "1");
+ Put("9", "1");
+ Flush();
+
+ ASSERT_TRUE(called.load());
+ called = false;
+
+  // Parallel loading of SST files
+ options.max_file_opening_threads = 16;
+ Reopen(options);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_TRUE(called.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
+  // Set up a sync point dependency to reproduce the race condition in
+  // DBImpl::GetColumnFamilyHandleUnlocked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
+ "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
+ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
+ "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"test1", "test2"}, Options());
+ ASSERT_EQ(handles_.size(), 2);
+
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ port::Thread user_thread1([&]() {
+ auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
+ ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+ TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
+ TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
+ ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+ });
+
+ port::Thread user_thread2([&]() {
+ TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
+ auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
+ ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+ TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
+ ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+ });
+
+ user_thread1.join();
+ user_thread2.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, TestCompactFiles) {
+  // Set up a sync point dependency to reproduce the race condition in
+  // DBImpl::GetColumnFamilyHandleUnlocked when CompactFiles runs concurrently
+  // with IngestExternalFile.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"TestCompactFiles::IngestExternalFile1",
+ "TestCompactFiles::IngestExternalFile2"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options;
+ options.num_levels = 2;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ auto* handle = db_->DefaultColumnFamily();
+ ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+ ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+ ROCKSDB_NAMESPACE::EnvOptions(), options};
+ std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
+ std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
+ std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";
+
+ ASSERT_OK(sst_file_writer.Open(external_file1));
+ ASSERT_OK(sst_file_writer.Put("1", "1"));
+ ASSERT_OK(sst_file_writer.Put("2", "2"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(sst_file_writer.Open(external_file2));
+ ASSERT_OK(sst_file_writer.Put("3", "3"));
+ ASSERT_OK(sst_file_writer.Put("4", "4"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(sst_file_writer.Open(external_file3));
+ ASSERT_OK(sst_file_writer.Put("5", "5"));
+ ASSERT_OK(sst_file_writer.Put("6", "6"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
+ IngestExternalFileOptions()));
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+ std::vector<std::string> files;
+ GetSstFiles(env_, dbname_, &files);
+ ASSERT_EQ(files.size(), 2);
+
+ port::Thread user_thread1(
+ [&]() { db_->CompactFiles(CompactionOptions(), handle, files, 1); });
+
+ port::Thread user_thread2([&]() {
+ ASSERT_OK(db_->IngestExternalFile(handle, {external_file2},
+ IngestExternalFileOptions()));
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
+ });
+
+ user_thread1.join();
+ user_thread2.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // ROCKSDB_LITE
+
+// TODO: figure out why this test fails in appveyor
+#ifndef OS_WIN
+TEST_F(DBTest2, MultiDBParallelOpenTest) {
+ const int kNumDbs = 2;
+ Options options = CurrentOptions();
+ std::vector<std::string> dbnames;
+ for (int i = 0; i < kNumDbs; ++i) {
+ dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i));
+ ASSERT_OK(DestroyDB(dbnames.back(), options));
+ }
+
+ // Verify empty DBs can be created in parallel
+ std::vector<std::thread> open_threads;
+ std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
+ options.create_if_missing = true;
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads.emplace_back(
+ [&](int dbnum) {
+ ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+ },
+ i);
+ }
+
+ // Now add some data and close, so next we can verify non-empty DBs can be
+ // recovered in parallel
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads[i].join();
+ ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
+ delete dbs[i];
+ }
+
+ // Verify non-empty DBs can be recovered in parallel
+ dbs.clear();
+ open_threads.clear();
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads.emplace_back(
+ [&](int dbnum) {
+ ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+ },
+ i);
+ }
+
+ // Wait and cleanup
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads[i].join();
+ delete dbs[i];
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ }
+}
+#endif // OS_WIN
+
+namespace {
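+// A minimal Statistics implementation that only uses the legacy interface
+// (recordTick/measureTime) and counts how often each is called, so the test
+// below can verify that the old interface still receives events.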
+class DummyOldStats : public Statistics {
+ public:
+ uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; }
+ void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
+ num_rt++;
+ }
+ void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
+ uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
+ return 0;
+ }
+ void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
+ num_mt++;
+ }
+ void histogramData(
+ uint32_t /*histogram_type*/,
+ ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
+ std::string getHistogramString(uint32_t /*type*/) const override {
+ return "";
+ }
+ bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
+ std::string ToString() const override { return ""; }
+ int num_rt = 0;
+ int num_mt = 0;
+};
+} // namespace
+
+TEST_F(DBTest2, OldStatsInterface) {
+ DummyOldStats* dos = new DummyOldStats();
+ std::shared_ptr<Statistics> stats(dos);
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = stats;
+ Reopen(options);
+
+ Put("foo", "bar");
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("foo"));
+
+ ASSERT_GT(dos->num_rt, 0);
+ ASSERT_GT(dos->num_mt, 0);
+}
+
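+// Close() should fail while a snapshot is still unreleased and succeed once
+// the snapshot has been released.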
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+ const Snapshot* ss = db_->GetSnapshot();
+
+ for (auto h : handles_) {
+ db_->DestroyColumnFamilyHandle(h);
+ }
+ handles_.clear();
+
+ ASSERT_NOK(db_->Close());
+ db_->ReleaseSnapshot(ss);
+ ASSERT_OK(db_->Close());
+ delete db_;
+ db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_OK(Put("bbb1", ""));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // Seeking into f1, the iterator checks the bloom filter, which causes the
+  // file iterator to be invalidated, and the cursor is placed into f2, with
+  // the next key being "ddd0".
+ iter->Seek("bbb1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbb1", iter->key().ToString());
+
+ // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
+ iter->Seek("ccc1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("ccc1", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // The seek is filtered out by f1's bloom filter.
+  // Ending up invalid is just one of several positions allowed by the
+  // iterator contract; positioning at ccc1 or ddd0 would also be valid.
+  // This only validates the behavior of the current implementation; if the
+  // underlying implementation changes, the test might fail here.
+ iter->Seek("bbb1");
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+}
+
+#ifndef ROCKSDB_LITE
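+// Row cache lookups must respect snapshots: reads at an older snapshot must
+// not be served from entries cached for a newer version of the key.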
+TEST_F(DBTest2, RowCacheSnapshot) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8 * 8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar1"));
+
+ const Snapshot* s1 = db_->GetSnapshot();
+
+ ASSERT_OK(Put("foo", "bar2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("foo2", "bar"));
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_OK(Put("foo3", "bar"));
+ const Snapshot* s3 = db_->GetSnapshot();
+
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+ ASSERT_EQ(Get("foo"), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo"), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo", s1), "bar1");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s2), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s1), "bar1");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s3), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+ db_->ReleaseSnapshot(s1);
+ db_->ReleaseSnapshot(s2);
+ db_->ReleaseSnapshot(s3);
+}
+#endif // ROCKSDB_LITE
+
+// When a DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and again after each subsequent
+// flush. If the DB crashes between these flushes, the already-flushed CF
+// will have advanced past the latest log file; since we then require the
+// log not to be corrupted, a corruption report is triggered.
+// This test exercises crash recovery around these manifest writes.
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+ const std::vector<std::string> sync_points = {
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+ for (const auto& test_sync_point : sync_points) {
+ Options options = CurrentOptions();
+ // First destroy original db to ensure a clean start.
+ DestroyAndReopen(options);
+ options.create_if_missing = true;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put(1, "foo", "bar"));
+    // The value is large enough to be divided into two blocks.
+ std::string large_value(400, ' ');
+ ASSERT_OK(Put("foo1", large_value));
+ ASSERT_OK(Put("foo2", large_value));
+ Close();
+
+ // Corrupt the log file in the middle, so that it is not corrupted
+ // in the tail.
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ for (const auto& f : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) {
+ std::string fname = dbname_ + "/" + f;
+ std::string file_content;
+ ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+ file_content[400] = 'h';
+ file_content[401] = 'a';
+ ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+ break;
+ }
+ }
+
+ // Reopen and freeze the file system after the first manifest write.
+ FaultInjectionTestEnv fit_env(options.env);
+ options.env = &fit_env;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ test_sync_point,
+ [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_NOK(TryReopenWithColumnFamilies(
+ {kDefaultColumnFamilyName, "pikachu"}, options));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ fit_env.SetFilesystemActive(true);
+    // If we keep using the fault injection Env, it complains when renaming
+    // the current file, which is not expected. This needs further
+    // investigation.
+ options.env = env_;
+ ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+ options));
+ }
+}
+
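+// Seeking into a file whose tail is entirely covered by a range deletion
+// should move on to the next file instead of returning a deleted key.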
+TEST_F(DBTest2, SeekFileRangeDeleteTail) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewCappedPrefixTransform(1));
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a", "a"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
+ ASSERT_OK(Put("b", "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("x", "a"));
+ ASSERT_OK(Put("z", "a"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ {
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+ iter->Seek("e");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("x", iter->key().ToString());
+ }
+ db_->ReleaseSnapshot(s1);
+}
+
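+// With avoid_unnecessary_blocking_io, deleting an iterator defers memtable
+// cleanup to a background purge, so memory tracked by the write buffer
+// manager is only released after the purge has run.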
+TEST_F(DBTest2, BackgroundPurgeTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_manager =
+ std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
+ options.avoid_unnecessary_blocking_io = true;
+ DestroyAndReopen(options);
+ size_t base_value = options.write_buffer_manager->memory_usage();
+
+ ASSERT_OK(Put("a", "a"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(Flush());
+ size_t value = options.write_buffer_manager->memory_usage();
+ ASSERT_GT(value, base_value);
+
+ db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
+ test::SleepingBackgroundTask sleeping_task_after;
+ db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+ delete iter;
+
+ Env::Default()->SleepForMicroseconds(100000);
+ value = options.write_buffer_manager->memory_usage();
+ ASSERT_GT(value, base_value);
+
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+
+ test::SleepingBackgroundTask sleeping_task_after2;
+ db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after2, Env::Priority::HIGH);
+ sleeping_task_after2.WakeUp();
+ sleeping_task_after2.WaitUntilDone();
+
+ value = options.write_buffer_manager->memory_usage();
+ ASSERT_EQ(base_value, value);
+}
+
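+// Switching the memtable during Flush must not race with a new MANIFEST
+// being created due to the tiny max_manifest_file_size.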
+TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ options.max_manifest_file_size = 10;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+
+ ASSERT_OK(Put("foo", "value"));
+ const int kL0Files = options.level0_file_num_compaction_trigger;
+ for (int i = 0; i < kL0Files; ++i) {
+ ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
+ ASSERT_OK(Flush(/*cf=*/1));
+ }
+
+ port::Thread thread([&]() { ASSERT_OK(Flush()); });
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+}
+
+TEST_F(DBTest2, SameSmallestInSameLevel) {
+  // This test validates the fractional cascading logic when several files at
+  // one level contain only the same user key.
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("key", "1"));
+ ASSERT_OK(Put("key", "2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
+ Flush();
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
+ nullptr));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
+ Flush();
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
+ Flush();
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
+ Flush();
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
+ Flush();
+ dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,4,1", FilesPerLevel());
+#endif // ROCKSDB_LITE
+
+ ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.block_size = 300;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ Reopen(options);
+
+ Random rnd(301);
+ std::string large_value = RandomString(&rnd, 500);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ Flush();
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ iterator->SeekForPrev("x3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+
+ iterator->SeekForPrev("a3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+
+ iterator->SeekForPrev("y3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+
+    // Query more than one non-existing prefix to cover both the case of an
+    // empty hash bucket and that of a hash bucket conflict.
+ iterator->SeekForPrev("b1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("c1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("d1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("y3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+ }
+}
+
+TEST_F(DBTest2, ChangePrefixExtractor) {
+ for (bool use_partitioned_filter : {true, false}) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+    // Sometimes the filter is checked based on the upper bound. Assert on
+    // counters in that case; otherwise, only check data correctness.
+#ifndef ROCKSDB_LITE
+ bool expect_filter_check = !use_partitioned_filter;
+#else
+ bool expect_filter_check = false;
+#endif
+ table_options.partition_filters = use_partitioned_filter;
+ if (use_partitioned_filter) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.statistics = CreateDBStatistics();
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ ASSERT_OK(Put("aa", ""));
+ ASSERT_OK(Put("xb", ""));
+ ASSERT_OK(Put("xx1", ""));
+ ASSERT_OK(Put("xz1", ""));
+ ASSERT_OK(Put("zz", ""));
+ Flush();
+
+    // After reopening the DB with the prefix size changed from 2 to 1, the
+    // prefix extractor only takes effect when it cannot change the results
+    // given the upper bound and the seek key.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ Reopen(options);
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ iterator->Seek("xa");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not
+ // correct in this case. So don't check counters in this case.
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xz");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xz1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ std::string ub_str = "xg9";
+ Slice ub(ub_str);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub;
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      // SeekForPrev() never uses the prefix bloom filter if the prefix
+      // extractor has changed.
+ iterator->SeekForPrev("xg0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ ub_str = "xx9";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ iterator->Seek("x");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ CompactRangeOptions compact_range_opts;
+ compact_range_opts.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+ // Re-execute similar queries after a full compaction
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+
+ iterator->Seek("x");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xg");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xz");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xz1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ iterator->SeekForPrev("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ ub_str = "xg9";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->SeekForPrev("xg0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+ }
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.block_size = 300;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.level0_file_num_compaction_trigger = 8;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("b1", "ok"));
+ Flush();
+
+  // Flush several files so that there is a high chance that the hash bucket
+  // for "b" is empty in at least one of the files.
+ ASSERT_OK(Put("a1", ""));
+ ASSERT_OK(Put("c1", ""));
+ Flush();
+
+ ASSERT_OK(Put("a2", ""));
+ ASSERT_OK(Put("c2", ""));
+ Flush();
+
+ ASSERT_OK(Put("a3", ""));
+ ASSERT_OK(Put("c3", ""));
+ Flush();
+
+ ASSERT_OK(Put("a4", ""));
+ ASSERT_OK(Put("c4", ""));
+ Flush();
+
+ ASSERT_OK(Put("a5", ""));
+ ASSERT_OK(Put("c5", ""));
+ Flush();
+
+ ASSERT_EQ("ok", Get("b1"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutoPrefixMode1) {
+  // Create a DB with a prefix extractor and a prefix bloom filter.
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ Random rnd(301);
+ std::string large_value = RandomString(&rnd, 500);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ Flush();
+
+ ReadOptions ro;
+ ro.total_order_seek = false;
+ ro.auto_prefix_mode = true;
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ std::string ub_str = "b9";
+ Slice ub(ub_str);
+ ro.iterate_upper_bound = &ub;
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ ub_str = "z";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ ub_str = "c";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ // The same queries without recreating iterator
+ {
+ ub_str = "b9";
+ ub = Slice(ub_str);
+ ro.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+ ub_str = "z";
+ ub = Slice(ub_str);
+
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+ ub_str = "c";
+ ub = Slice(ub_str);
+
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+ ub_str = "b9";
+ ub = Slice(ub_str);
+ ro.iterate_upper_bound = &ub;
+ iterator->SeekForPrev("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+ ub_str = "zz";
+ ub = Slice(ub_str);
+ ro.iterate_upper_bound = &ub;
+ iterator->SeekToLast();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+
+ iterator->SeekToFirst();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test_util.cc b/src/rocksdb/db/db_test_util.cc
new file mode 100644
index 000000000..c73abde41
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.cc
@@ -0,0 +1,1564 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Special Env used to delay background operations
+
+SpecialEnv::SpecialEnv(Env* base)
+ : EnvWrapper(base),
+ rnd_(301),
+ sleep_counter_(this),
+ addon_time_(0),
+ time_elapse_only_sleep_(false),
+ no_slowdown_(false) {
+ delay_sstable_sync_.store(false, std::memory_order_release);
+ drop_writes_.store(false, std::memory_order_release);
+ no_space_.store(false, std::memory_order_release);
+ non_writable_.store(false, std::memory_order_release);
+ count_random_reads_ = false;
+ count_sequential_reads_ = false;
+ manifest_sync_error_.store(false, std::memory_order_release);
+ manifest_write_error_.store(false, std::memory_order_release);
+ log_write_error_.store(false, std::memory_order_release);
+ random_file_open_counter_.store(0, std::memory_order_relaxed);
+ delete_count_.store(0, std::memory_order_relaxed);
+ num_open_wal_file_.store(0);
+ log_write_slowdown_ = 0;
+ bytes_written_ = 0;
+ sync_counter_ = 0;
+ non_writeable_rate_ = 0;
+ new_writable_count_ = 0;
+ non_writable_count_ = 0;
+ table_write_callback_ = nullptr;
+}
+#ifndef ROCKSDB_LITE
+ROT13BlockCipher rot13Cipher_(16);
+#endif // ROCKSDB_LITE
+
+DBTestBase::DBTestBase(const std::string path)
+ : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) {
+ Env* base_env = Env::Default();
+#ifndef ROCKSDB_LITE
+ const char* test_env_uri = getenv("TEST_ENV_URI");
+ if (test_env_uri) {
+ Env* test_env = nullptr;
+ Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_);
+ base_env = test_env;
+ EXPECT_OK(s);
+ EXPECT_NE(Env::Default(), base_env);
+ }
+#endif // !ROCKSDB_LITE
+ EXPECT_NE(nullptr, base_env);
+ if (getenv("MEM_ENV")) {
+ mem_env_ = new MockEnv(base_env);
+ }
+#ifndef ROCKSDB_LITE
+ if (getenv("ENCRYPTED_ENV")) {
+ encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env,
+ new CTREncryptionProvider(rot13Cipher_));
+ }
+#endif // !ROCKSDB_LITE
+ env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_
+ : (mem_env_ ? mem_env_ : base_env));
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath(env_, path);
+ alternative_wal_dir_ = dbname_ + "/wal";
+ alternative_db_log_dir_ = dbname_ + "/db_log_dir";
+ auto options = CurrentOptions();
+ options.env = env_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+  // Destroy it again in case the alternative WAL dir is not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ Random::GetTLSInstance()->Reset(0xdeadbeef);
+}
+
+DBTestBase::~DBTestBase() {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ options.env = env_;
+
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s\n", dbname_.c_str());
+ } else {
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+ delete env_;
+}
+
+bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) {
+#ifdef ROCKSDB_LITE
+ // These options are not supported in ROCKSDB_LITE
+ if (option_config == kHashSkipList ||
+ option_config == kPlainTableFirstBytePrefix ||
+ option_config == kPlainTableCappedPrefix ||
+ option_config == kPlainTableCappedPrefixNonMmap ||
+ option_config == kPlainTableAllBytesPrefix ||
+ option_config == kVectorRep || option_config == kHashLinkList ||
+ option_config == kUniversalCompaction ||
+ option_config == kUniversalCompactionMultiLevel ||
+ option_config == kUniversalSubcompactions ||
+ option_config == kFIFOCompaction ||
+ option_config == kConcurrentSkipList) {
+ return true;
+ }
+#endif
+
+ if ((skip_mask & kSkipUniversalCompaction) &&
+ (option_config == kUniversalCompaction ||
+ option_config == kUniversalCompactionMultiLevel ||
+ option_config == kUniversalSubcompactions)) {
+ return true;
+ }
+ if ((skip_mask & kSkipMergePut) && option_config == kMergePut) {
+ return true;
+ }
+ if ((skip_mask & kSkipNoSeekToLast) &&
+ (option_config == kHashLinkList || option_config == kHashSkipList)) {
+ return true;
+ }
+ if ((skip_mask & kSkipPlainTable) &&
+ (option_config == kPlainTableAllBytesPrefix ||
+ option_config == kPlainTableFirstBytePrefix ||
+ option_config == kPlainTableCappedPrefix ||
+ option_config == kPlainTableCappedPrefixNonMmap)) {
+ return true;
+ }
+ if ((skip_mask & kSkipHashIndex) &&
+ (option_config == kBlockBasedTableWithPrefixHashIndex ||
+ option_config == kBlockBasedTableWithWholeKeyHashIndex)) {
+ return true;
+ }
+ if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) {
+ return true;
+ }
+ if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) {
+ return true;
+ }
+ return false;
+}
+
+// Switch to a fresh database with the next option configuration to
+// test. Return false if there are no more configurations to test.
+bool DBTestBase::ChangeOptions(int skip_mask) {
+ for (option_config_++; option_config_ < kEnd; option_config_++) {
+ if (ShouldSkipOptions(option_config_, skip_mask)) {
+ continue;
+ }
+ break;
+ }
+
+ if (option_config_ >= kEnd) {
+ Destroy(last_options_);
+ return false;
+ } else {
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ return true;
+ }
+}
+
+// Switch between different compaction styles.
+bool DBTestBase::ChangeCompactOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kUniversalCompaction;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompaction) {
+ option_config_ = kUniversalCompactionMultiLevel;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompactionMultiLevel) {
+ option_config_ = kLevelSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kLevelSubcompactions) {
+ option_config_ = kUniversalSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Switch between different WAL settings
+bool DBTestBase::ChangeWalOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kDBLogDir;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kDBLogDir) {
+ option_config_ = kWalDirAndMmapReads;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kWalDirAndMmapReads) {
+ option_config_ = kRecycleLogFiles;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ TryReopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Switch between different filter policies.
+// Jump from kDefault to kFilter, then to the full-filter and
+// partitioned-filter configurations.
+bool DBTestBase::ChangeFilterOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kFilter;
+ } else if (option_config_ == kFilter) {
+ option_config_ = kFullFilterWithNewTableReaderForCompactions;
+ } else if (option_config_ == kFullFilterWithNewTableReaderForCompactions) {
+ option_config_ = kPartitionedFilterWithNewTableReaderForCompactions;
+ } else {
+ return false;
+ }
+ Destroy(last_options_);
+
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+}
+
+// Switch between different DB options for file ingestion tests.
+bool DBTestBase::ChangeOptionsForFileIngestionTest() {
+ if (option_config_ == kDefault) {
+ option_config_ = kUniversalCompaction;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompaction) {
+ option_config_ = kUniversalCompactionMultiLevel;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompactionMultiLevel) {
+ option_config_ = kLevelSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kLevelSubcompactions) {
+ option_config_ = kUniversalSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalSubcompactions) {
+ option_config_ = kDirectIO;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ TryReopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Return the current option configuration.
+Options DBTestBase::CurrentOptions(
+ const anon::OptionsOverride& options_override) const {
+ return GetOptions(option_config_, GetDefaultOptions(), options_override);
+}
+
+Options DBTestBase::CurrentOptions(
+ const Options& default_options,
+ const anon::OptionsOverride& options_override) const {
+ return GetOptions(option_config_, default_options, options_override);
+}
+
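+// Baseline options shared by all configurations; GetOptions() overrides
+// these per option config.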
+Options DBTestBase::GetDefaultOptions() {
+ Options options;
+ options.write_buffer_size = 4090 * 4096;
+ options.target_file_size_base = 2 * 1024 * 1024;
+ options.max_bytes_for_level_base = 10 * 1024 * 1024;
+ options.max_open_files = 5000;
+ options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options.compaction_pri = CompactionPri::kByCompensatedSize;
+ return options;
+}
+
+Options DBTestBase::GetOptions(
+ int option_config, const Options& default_options,
+ const anon::OptionsOverride& options_override) const {
+  // This redundant copy is to minimize code changes without causing a lint
+  // error.
+ Options options = default_options;
+ BlockBasedTableOptions table_options;
+ bool set_block_based_table_factory = true;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+ !defined(OS_AIX)
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "NewRandomAccessFile:O_DIRECT");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "NewWritableFile:O_DIRECT");
+#endif
+
+ bool can_allow_mmap = IsMemoryMappedAccessSupported();
+ switch (option_config) {
+#ifndef ROCKSDB_LITE
+ case kHashSkipList:
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kPlainTableFirstBytePrefix:
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableCappedPrefix:
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableCappedPrefixNonMmap:
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ options.allow_mmap_reads = false;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableAllBytesPrefix:
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kVectorRep:
+ options.memtable_factory.reset(new VectorRepFactory(100));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kHashLinkList:
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kDirectIO: {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.compaction_readahead_size = 2 * 1024 * 1024;
+ #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+ !defined(OS_AIX) && !defined(OS_OPENBSD)
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewWritableFile:O_DIRECT", [&](void* arg) {
+ int* val = static_cast<int*>(arg);
+ *val &= ~O_DIRECT;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewRandomAccessFile:O_DIRECT", [&](void* arg) {
+ int* val = static_cast<int*>(arg);
+ *val &= ~O_DIRECT;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+#endif
+ break;
+ }
+#endif // ROCKSDB_LITE
+ case kMergePut:
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ break;
+ case kFilter:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ break;
+ case kFullFilterWithNewTableReaderForCompactions:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.new_table_reader_for_compaction_inputs = true;
+ options.compaction_readahead_size = 10 * 1024 * 1024;
+ break;
+ case kPartitionedFilterWithNewTableReaderForCompactions:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.new_table_reader_for_compaction_inputs = true;
+ options.compaction_readahead_size = 10 * 1024 * 1024;
+ break;
+ case kUncompressed:
+ options.compression = kNoCompression;
+ break;
+ case kNumLevel_3:
+ options.num_levels = 3;
+ break;
+ case kDBLogDir:
+ options.db_log_dir = alternative_db_log_dir_;
+ break;
+ case kWalDirAndMmapReads:
+ options.wal_dir = alternative_wal_dir_;
+      // mmap reads should be orthogonal to the WAL dir setting, so we
+      // piggyback on this option config to test mmap reads as well.
+ options.allow_mmap_reads = can_allow_mmap;
+ break;
+ case kManifestFileSize:
+ options.max_manifest_file_size = 50; // 50 bytes
+ break;
+ case kPerfOptions:
+ options.soft_rate_limit = 2.0;
+ options.delayed_write_rate = 8 * 1024 * 1024;
+ options.report_bg_io_stats = true;
+ // TODO(3.13) -- test more options
+ break;
+ case kUniversalCompaction:
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ break;
+ case kUniversalCompactionMultiLevel:
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 8;
+ break;
+ case kCompressedBlockCache:
+ options.allow_mmap_writes = can_allow_mmap;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ break;
+ case kInfiniteMaxOpenFiles:
+ options.max_open_files = -1;
+ break;
+ case kxxHashChecksum: {
+ table_options.checksum = kxxHash;
+ break;
+ }
+ case kxxHash64Checksum: {
+ table_options.checksum = kxxHash64;
+ break;
+ }
+ case kFIFOCompaction: {
+ options.compaction_style = kCompactionStyleFIFO;
+ break;
+ }
+ case kBlockBasedTableWithPrefixHashIndex: {
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ break;
+ }
+ case kBlockBasedTableWithWholeKeyHashIndex: {
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.prefix_extractor.reset(NewNoopTransform());
+ break;
+ }
+ case kBlockBasedTableWithPartitionedIndex: {
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ options.prefix_extractor.reset(NewNoopTransform());
+ break;
+ }
+ case kBlockBasedTableWithPartitionedIndexFormat4: {
+ table_options.format_version = 4;
+      // Format 4 changes the binary index format. Since a partitioned index
+      // is a super-set of simple indexes, we also use kTwoLevelIndexSearch to
+      // test this format.
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      // The top-level index in partitioned filters is also affected by format 4.
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.partition_filters = true;
+ table_options.index_block_restart_interval = 8;
+ break;
+ }
+ case kBlockBasedTableWithIndexRestartInterval: {
+ table_options.index_block_restart_interval = 8;
+ break;
+ }
+ case kOptimizeFiltersForHits: {
+ options.optimize_filters_for_hits = true;
+ set_block_based_table_factory = true;
+ break;
+ }
+ case kRowCache: {
+ options.row_cache = NewLRUCache(1024 * 1024);
+ break;
+ }
+ case kRecycleLogFiles: {
+ options.recycle_log_file_num = 2;
+ break;
+ }
+ case kLevelSubcompactions: {
+ options.max_subcompactions = 4;
+ break;
+ }
+ case kUniversalSubcompactions: {
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 8;
+ options.max_subcompactions = 4;
+ break;
+ }
+ case kConcurrentSkipList: {
+ options.allow_concurrent_memtable_write = true;
+ options.enable_write_thread_adaptive_yield = true;
+ break;
+ }
+ case kPipelinedWrite: {
+ options.enable_pipelined_write = true;
+ break;
+ }
+ case kConcurrentWALWrites: {
+      // These options optimize the 2PC commit path.
+ options.two_write_queues = true;
+ options.manual_wal_flush = true;
+ break;
+ }
+ case kUnorderedWrite: {
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ if (options_override.filter_policy) {
+ table_options.filter_policy = options_override.filter_policy;
+ table_options.partition_filters = options_override.partition_filters;
+ table_options.metadata_block_size = options_override.metadata_block_size;
+ }
+ if (set_block_based_table_factory) {
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+ options.env = env_;
+ options.create_if_missing = true;
+ options.fail_if_options_file_error = true;
+ return options;
+}
+
+void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]);
+ ASSERT_OK(s);
+ }
+}
+
+void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs, const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ last_options_ = options[0];
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs, const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+}
+
+void DBTestBase::Reopen(const Options& options) {
+ ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Close() {
+ for (auto h : handles_) {
+ db_->DestroyColumnFamilyHandle(h);
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+}
+
+void DBTestBase::DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ if (delete_cf_paths) {
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ ColumnFamilyDescriptor cfdescriptor;
+ handles_[i]->GetDescriptor(&cfdescriptor);
+ column_families.push_back(cfdescriptor);
+ }
+ }
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options, column_families));
+}
+
+Status DBTestBase::ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+}
+
+Status DBTestBase::TryReopen(const Options& options) {
+ Close();
+ last_options_.table_factory.reset();
+  // Note: operator= is an unsafe approach here since it destructs the
+  // std::shared_ptr members in the order of their creation, in contrast to
+  // destructors, which destruct them in the reverse order of creation. One
+  // particular problem is that the cache destructor might invoke callback
+  // functions that use Option members such as statistics. To work around this
+  // problem, we manually reset table_factory, which eventually clears the
+  // block cache.
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+}
+
+bool DBTestBase::IsDirectIOSupported() {
+ return test::IsDirectIOSupported(env_, dbname_);
+}
+
+bool DBTestBase::IsMemoryMappedAccessSupported() const {
+ return (!encrypted_env_);
+}
+
+Status DBTestBase::Flush(int cf) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+}
+
+Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
+ std::vector<ColumnFamilyHandle*> cfhs;
+ std::for_each(cf_ids.begin(), cf_ids.end(),
+ [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
+ return db_->Flush(FlushOptions(), cfhs);
+}
+
+Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
+ if (kMergePut == option_config_) {
+ return db_->Merge(wo, k, v);
+ } else {
+ return db_->Put(wo, k, v);
+ }
+}
+
+Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo) {
+ if (kMergePut == option_config_) {
+ return db_->Merge(wo, handles_[cf], k, v);
+ } else {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+}
+
+Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) {
+ return db_->Merge(wo, k, v);
+}
+
+Status DBTestBase::Merge(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo) {
+ return db_->Merge(wo, handles_[cf], k, v);
+}
+
+Status DBTestBase::Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+}
+
+Status DBTestBase::Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+}
+
+Status DBTestBase::SingleDelete(const std::string& k) {
+ return db_->SingleDelete(WriteOptions(), k);
+}
+
+Status DBTestBase::SingleDelete(int cf, const std::string& k) {
+ return db_->SingleDelete(WriteOptions(), handles_[cf], k);
+}
+
+bool DBTestBase::SetPreserveDeletesSequenceNumber(SequenceNumber sn) {
+ return db_->SetPreserveDeletesSequenceNumber(sn);
+}
+
+std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
+std::string DBTestBase::Get(int cf, const std::string& k,
+ const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
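+// MultiGet across the given column families; uses the batched MultiGet API
+// when `batched` is true and the status-vector overload otherwise.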
+std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
+ const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool batched) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::vector<ColumnFamilyHandle*> handles;
+ std::vector<Slice> keys;
+ std::vector<std::string> result;
+
+ for (unsigned int i = 0; i < cfs.size(); ++i) {
+ handles.push_back(handles_[cfs[i]]);
+ keys.push_back(k[i]);
+ }
+ std::vector<Status> s;
+ if (!batched) {
+ s = db_->MultiGet(options, handles, keys, &result);
+ for (unsigned int i = 0; i < s.size(); ++i) {
+ if (s[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!s[i].ok()) {
+ result[i] = s[i].ToString();
+ }
+ }
+ } else {
+ std::vector<PinnableSlice> pin_values(cfs.size());
+ result.resize(cfs.size());
+ s.resize(cfs.size());
+ db_->MultiGet(options, cfs.size(), handles.data(), keys.data(),
+ pin_values.data(), s.data());
+ for (unsigned int i = 0; i < s.size(); ++i) {
+ if (s[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!s[i].ok()) {
+ result[i] = s[i].ToString();
+ } else {
+ result[i].assign(pin_values[i].data(), pin_values[i].size());
+ }
+ }
+ }
+ return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(const std::vector<std::string>& k,
+ const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::vector<Slice> keys;
+ std::vector<std::string> result;
+ std::vector<Status> statuses(k.size());
+ std::vector<PinnableSlice> pin_values(k.size());
+
+ for (unsigned int i = 0; i < k.size(); ++i) {
+ keys.push_back(k[i]);
+ }
+ db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), pin_values.data(), statuses.data());
+ result.resize(k.size());
+ for (auto iter = result.begin(); iter != result.end(); ++iter) {
+ iter->assign(pin_values[iter - result.begin()].data(),
+ pin_values[iter - result.begin()].size());
+ }
+ for (unsigned int i = 0; i < statuses.size(); ++i) {
+ if (statuses[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ }
+ }
+ return result;
+}
+
+Status DBTestBase::Get(const std::string& k, PinnableSlice* v) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v);
+ return s;
+}
+
+uint64_t DBTestBase::GetNumSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
+ return int_num;
+}
+
+uint64_t DBTestBase::GetTimeOldestSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
+ return int_num;
+}
+
+uint64_t DBTestBase::GetSequenceOldestSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.oldest-snapshot-sequence", &int_num));
+ return int_num;
+}
+
+// Return a string that contains all key,value pairs in order,
+// formatted like "(k1->v1)(k2->v2)".
+std::string DBTestBase::Contents(int cf) {
+ std::vector<std::string> forward;
+ std::string result;
+ Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+ : db_->NewIterator(ReadOptions(), handles_[cf]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string s = IterStatus(iter);
+ result.push_back('(');
+ result.append(s);
+ result.push_back(')');
+ forward.push_back(s);
+ }
+
+ // Check reverse iteration results are the reverse of forward results
+ unsigned int matched = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ EXPECT_LT(matched, forward.size());
+ EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+ matched++;
+ }
+ EXPECT_EQ(matched, forward.size());
+
+ delete iter;
+ return result;
+}
+
+std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
+ Arena arena;
+ auto options = CurrentOptions();
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ ScopedArenaIterator iter;
+ if (cf == 0) {
+ iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+ kMaxSequenceNumber));
+ } else {
+ iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+ kMaxSequenceNumber, handles_[cf]));
+ }
+ InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+ iter->Seek(target.Encode());
+ std::string result;
+ if (!iter->status().ok()) {
+ result = iter->status().ToString();
+ } else {
+ result = "[ ";
+ bool first = true;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ if (!ParseInternalKey(iter->key(), &ikey)) {
+ result += "CORRUPTED";
+ } else {
+ if (!last_options_.comparator->Equal(ikey.user_key, user_key)) {
+ break;
+ }
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ switch (ikey.type) {
+ case kTypeValue:
+ result += iter->value().ToString();
+ break;
+ case kTypeMerge:
+ // keep it the same as kTypeValue for testing kMergePut
+ result += iter->value().ToString();
+ break;
+ case kTypeDeletion:
+ result += "DEL";
+ break;
+ case kTypeSingleDeletion:
+ result += "SDEL";
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ iter->Next();
+ }
+ if (!first) {
+ result += " ";
+ }
+ result += "]";
+ }
+ return result;
+}
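+
+// Usage sketch (illustrative, with hypothetical keys/values): after
+//   ASSERT_OK(Put("foo", "v1"));
+//   ASSERT_OK(Delete("foo"));
+//   ASSERT_OK(Flush());
+//   ASSERT_OK(Put("foo", "v2"));
+// AllEntriesFor("foo") would typically return "[ v2, DEL, v1 ]", i.e. every
+// internal entry for the user key, newest first.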
+
+#ifndef ROCKSDB_LITE
+int DBTestBase::NumSortedRuns(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
+ for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
+ if (cf_meta.levels[i].files.size() > 0) {
+ num_sr++;
+ }
+ }
+ return num_sr;
+}
+
+uint64_t DBTestBase::TotalSize(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ return cf_meta.size;
+}
+
+uint64_t DBTestBase::SizeAtLevel(int level) {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ uint64_t sum = 0;
+ for (const auto& m : metadata) {
+ if (m.level == level) {
+ sum += m.size;
+ }
+ }
+ return sum;
+}
+
+size_t DBTestBase::TotalLiveFiles(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ size_t num_files = 0;
+ for (auto& level : cf_meta.levels) {
+ num_files += level.files.size();
+ }
+ return num_files;
+}
+
+size_t DBTestBase::CountLiveFiles() {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ return metadata.size();
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+ &property));
+ }
+ return atoi(property.c_str());
+}
+
+double DBTestBase::CompressionRatioAtLevel(int level, int cf) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.compression-ratio-at-level" + NumberToString(level),
+ &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf],
+ "rocksdb.compression-ratio-at-level" + NumberToString(level),
+ &property));
+ }
+ return std::stod(property);
+}
+
+int DBTestBase::TotalTableFiles(int cf, int levels) {
+ if (levels == -1) {
+ levels = (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ }
+ int result = 0;
+ for (int level = 0; level < levels; level++) {
+ result += NumTableFilesAtLevel(level, cf);
+ }
+ return result;
+}
+
+// Return spread of files per level
+std::string DBTestBase::FilesPerLevel(int cf) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+}
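+
+// Example of the format produced above: with 2 files at L0, none at L1 and
+// one file at L2, FilesPerLevel() returns "2,0,1"; trailing levels that hold
+// no files are trimmed from the string.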
+#endif // !ROCKSDB_LITE
+
+size_t DBTestBase::CountFiles() {
+ std::vector<std::string> files;
+ env_->GetChildren(dbname_, &files);
+
+ std::vector<std::string> logfiles;
+ if (dbname_ != last_options_.wal_dir) {
+ env_->GetChildren(last_options_.wal_dir, &logfiles);
+ }
+
+ return files.size() + logfiles.size();
+}
+
+uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) {
+ Range r(start, limit);
+ uint64_t size;
+ if (cf == 0) {
+ db_->GetApproximateSizes(&r, 1, &size);
+ } else {
+ db_->GetApproximateSizes(handles_[1], &r, 1, &size);
+ }
+ return size;
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+}
+
+// Do n memtable compactions, each of which produces an sstable
+// covering the range [small,large].
+void DBTestBase::MakeTables(int n, const std::string& small,
+ const std::string& large, int cf) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ MoveFilesToLevel(n - i - 1, cf);
+ }
+}
+
+// Prevent pushing of new sstables into deeper levels by adding
+// tables that cover a specified range to all levels.
+void DBTestBase::FillLevels(const std::string& smallest,
+ const std::string& largest, int cf) {
+ MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+}
+
+void DBTestBase::MoveFilesToLevel(int level, int cf) {
+ for (int l = 0; l < level; ++l) {
+ if (cf > 0) {
+ dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]);
+ } else {
+ dbfull()->TEST_CompactRange(l, nullptr, nullptr);
+ }
+ }
+}
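+
+// Usage sketch (illustrative): the helpers above are typically combined to
+// shape the LSM tree before exercising a compaction path, e.g.
+//
+//   ASSERT_OK(Put("key", "value"));
+//   ASSERT_OK(Flush());
+//   MoveFilesToLevel(2);  // push the freshly flushed L0 file down to L2
+//
+// FillLevels(smallest, largest, cf) instead seeds every level of the given
+// column family with one covering file, which pins later sstables at L0.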
+
+#ifndef ROCKSDB_LITE
+void DBTestBase::DumpFileCounts(const char* label) {
+ fprintf(stderr, "---\n%s:\n", label);
+ fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
+ dbfull()->TEST_MaxNextLevelOverlappingBytes());
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int num = NumTableFilesAtLevel(level);
+ if (num > 0) {
+ fprintf(stderr, " level %3d : %d files\n", level, num);
+ }
+ }
+}
+#endif // !ROCKSDB_LITE
+
+std::string DBTestBase::DumpSSTableList() {
+ std::string property;
+ db_->GetProperty("rocksdb.sstables", &property);
+ return property;
+}
+
+void DBTestBase::GetSstFiles(Env* env, std::string path,
+ std::vector<std::string>* files) {
+ env->GetChildren(path, files);
+
+ files->erase(
+ std::remove_if(files->begin(), files->end(), [](std::string name) {
+ uint64_t number;
+ FileType type;
+ return !(ParseFileName(name, &number, &type) && type == kTableFile);
+ }), files->end());
+}
+
+int DBTestBase::GetSstFileCount(std::string path) {
+ std::vector<std::string> files;
+ DBTestBase::GetSstFiles(env_, path, &files);
+ return static_cast<int>(files.size());
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx,
+ bool nowait) {
+ for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+ ASSERT_OK(Put(cf, Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990)));
+ (*key_idx)++;
+ }
+ if (!nowait) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) {
+ for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+ ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990)));
+ (*key_idx)++;
+ }
+ if (!nowait) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+}
+
+const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51;
+
+void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) {
+ for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) {
+ ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 2000)));
+ }
+ ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 200)));
+ if (!nowait) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+}
+
+std::string DBTestBase::IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+}
+
+Options DBTestBase::OptionsForLogIterTest() {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.WAL_ttl_seconds = 1000;
+ return options;
+}
+
+std::string DBTestBase::DummyString(size_t len, char c) {
+ return std::string(len, c);
+}
+
+void DBTestBase::VerifyIterLast(std::string expected_key, int cf) {
+ Iterator* iter;
+ ReadOptions ro;
+ if (cf == 0) {
+ iter = db_->NewIterator(ro);
+ } else {
+ iter = db_->NewIterator(ro, handles_[cf]);
+ }
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), expected_key);
+ delete iter;
+}
+
+// Used to test InplaceUpdate
+
+// If the previous value is nullptr, sets newValue to a string of delta's
+// size (filled with 'c') and returns UPDATED.
+// Otherwise, shrinks the previous value in place to a string of 'b's that is
+// one byte shorter than before and returns UPDATED_INPLACE.
+UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue) {
+ if (prevValue == nullptr) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+ } else {
+ *prevSize = *prevSize - 1;
+ std::string str_b = std::string(*prevSize, 'b');
+ memcpy(prevValue, str_b.c_str(), str_b.size());
+ return UpdateStatus::UPDATED_INPLACE;
+ }
+}
+
+UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue) {
+ if (prevValue == nullptr) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+ } else {
+ *prevSize = 1;
+ std::string str_b = std::string(*prevSize, 'b');
+ memcpy(prevValue, str_b.c_str(), str_b.size());
+ return UpdateStatus::UPDATED_INPLACE;
+ }
+}
+
+UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/,
+ uint32_t* /*prevSize*/,
+ Slice delta,
+ std::string* newValue) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+}
+
+UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/,
+ uint32_t* /*prevSize*/,
+ Slice /*delta*/,
+ std::string* /*newValue*/) {
+ return UpdateStatus::UPDATE_FAILED;
+}
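+
+// Wiring sketch (a minimal example, assuming the standard
+// Options::inplace_callback hook): tests install one of the callbacks above
+// roughly like this.
+//
+//   Options options = CurrentOptions();
+//   options.inplace_update_support = true;
+//   // in-place updates require the non-concurrent memtable write path
+//   options.allow_concurrent_memtable_write = false;
+//   options.inplace_callback = DBTestBase::updateInPlaceSmallerSize;
+//   DestroyAndReopen(options);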
+
+// Utility method to test InplaceUpdate
+void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
+ Arena arena;
+ auto options = CurrentOptions();
+ InternalKeyComparator icmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ // This should be defined after range_del_agg so that the assigned iterator
+ // is destructed before range_del_agg is destructed.
+ ScopedArenaIterator iter;
+ if (cf != 0) {
+ iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+ kMaxSequenceNumber, handles_[cf]));
+ } else {
+ iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+ kMaxSequenceNumber));
+ }
+ iter->SeekToFirst();
+ ASSERT_EQ(iter->status().ok(), true);
+ int seq = numValues;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+
+ // checks sequence number for updates
+ ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+ iter->Next();
+ }
+ ASSERT_EQ(0, seq);
+}
+
+void DBTestBase::CopyFile(const std::string& source,
+ const std::string& destination, uint64_t size) {
+ const EnvOptions soptions;
+ std::unique_ptr<SequentialFile> srcfile;
+ ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+ std::unique_ptr<WritableFile> destfile;
+ ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+ if (size == 0) {
+ // default argument means copy everything
+ ASSERT_OK(env_->GetFileSize(source, &size));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+ ASSERT_OK(srcfile->Read(one, &slice, buffer));
+ ASSERT_OK(destfile->Append(slice));
+ size -= slice.size();
+ }
+ ASSERT_OK(destfile->Close());
+}
+
+std::unordered_map<std::string, uint64_t> DBTestBase::GetAllSSTFiles(
+ uint64_t* total_size) {
+ std::unordered_map<std::string, uint64_t> res;
+
+ if (total_size) {
+ *total_size = 0;
+ }
+ std::vector<std::string> files;
+ env_->GetChildren(dbname_, &files);
+ for (auto& file_name : files) {
+ uint64_t number;
+ FileType type;
+ std::string file_path = dbname_ + "/" + file_name;
+ if (ParseFileName(file_name, &number, &type) && type == kTableFile) {
+ uint64_t file_size = 0;
+ env_->GetFileSize(file_path, &file_size);
+ res[file_path] = file_size;
+ if (total_size) {
+ *total_size += file_size;
+ }
+ }
+ }
+ return res;
+}
+
+std::vector<std::uint64_t> DBTestBase::ListTableFiles(Env* env,
+ const std::string& path) {
+ std::vector<std::string> files;
+ std::vector<uint64_t> file_numbers;
+ env->GetChildren(path, &files);
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < files.size(); ++i) {
+ if (ParseFileName(files[i], &number, &type)) {
+ if (type == kTableFile) {
+ file_numbers.push_back(number);
+ }
+ }
+ }
+ return file_numbers;
+}
+
+void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
+ size_t* total_reads_res, bool tailing_iter,
+ std::map<std::string, Status> status) {
+ size_t total_reads = 0;
+
+ for (auto& kv : true_data) {
+ Status s = status[kv.first];
+ if (s.ok()) {
+ ASSERT_EQ(Get(kv.first), kv.second);
+ } else {
+ std::string value;
+ ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value));
+ }
+ total_reads++;
+ }
+
+ // Normal Iterator
+ {
+ int iter_cnt = 0;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(ro);
+ // Verify Iterator::Next()
+ iter_cnt = 0;
+ auto data_iter = true_data.begin();
+ Status s;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ Status current_status = status[data_iter->first];
+ if (!current_status.ok()) {
+ s = current_status;
+ }
+ ASSERT_EQ(iter->status(), s);
+ if (current_status.ok()) {
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / "
+ << true_data.size();
+ delete iter;
+
+ // Verify Iterator::Prev()
+ // Use a new iterator to make sure its status is clean.
+ iter = db_->NewIterator(ro);
+ iter_cnt = 0;
+ s = Status::OK();
+ auto data_rev = true_data.rbegin();
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) {
+ ASSERT_EQ(iter->key().ToString(), data_rev->first);
+ Status current_status = status[data_rev->first];
+ if (!current_status.ok()) {
+ s = current_status;
+ }
+ ASSERT_EQ(iter->status(), s);
+ if (current_status.ok()) {
+ ASSERT_EQ(iter->value().ToString(), data_rev->second);
+ }
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_rev, true_data.rend()) << iter_cnt << " / "
+ << true_data.size();
+
+ // Verify Iterator::Seek()
+ for (auto kv : true_data) {
+ iter->Seek(kv.first);
+ ASSERT_EQ(kv.first, iter->key().ToString());
+ ASSERT_EQ(kv.second, iter->value().ToString());
+ total_reads++;
+ }
+ delete iter;
+ }
+
+ if (tailing_iter) {
+#ifndef ROCKSDB_LITE
+ // Tailing iterator
+ int iter_cnt = 0;
+ ReadOptions ro;
+ ro.tailing = true;
+ ro.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(ro);
+
+ // Verify ForwardIterator::Next()
+ iter_cnt = 0;
+ auto data_iter = true_data.begin();
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / "
+ << true_data.size();
+
+ // Verify ForwardIterator::Seek()
+ for (auto kv : true_data) {
+ iter->Seek(kv.first);
+ ASSERT_EQ(kv.first, iter->key().ToString());
+ ASSERT_EQ(kv.second, iter->value().ToString());
+ total_reads++;
+ }
+
+ delete iter;
+#endif // ROCKSDB_LITE
+ }
+
+ if (total_reads_res) {
+ *total_reads_res = total_reads;
+ }
+}
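+
+// Usage sketch (illustrative): tests build the expected view alongside their
+// writes and then let VerifyDBFromMap() cross-check Get(), forward/backward
+// iteration and Seek() in a single call.
+//
+//   Random rnd(301);
+//   std::map<std::string, std::string> true_data;
+//   for (int i = 0; i < 100; i++) {
+//     std::string k = Key(i);
+//     std::string v = RandomString(&rnd, 100);
+//     ASSERT_OK(Put(k, v));
+//     true_data[k] = v;
+//   }
+//   size_t total_reads;
+//   VerifyDBFromMap(true_data, &total_reads);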
+
+void DBTestBase::VerifyDBInternal(
+ std::vector<std::pair<std::string, std::string>> true_data) {
+ Arena arena;
+ InternalKeyComparator icmp(last_options_.comparator);
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ auto iter =
+ dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber);
+ iter->SeekToFirst();
+ for (auto p : true_data) {
+ ASSERT_TRUE(iter->Valid());
+ ParsedInternalKey ikey;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+ ASSERT_EQ(p.first, ikey.user_key);
+ ASSERT_EQ(p.second, iter->value());
+ iter->Next();
+ };
+ ASSERT_FALSE(iter->Valid());
+ iter->~InternalIterator();
+}
+
+#ifndef ROCKSDB_LITE
+
+uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
+ DB* db, std::string column_family_name) {
+ std::vector<LiveFileMetaData> metadata;
+ db->GetLiveFilesMetaData(&metadata);
+ uint64_t result = 0;
+ for (auto& fileMetadata : metadata) {
+ result += (fileMetadata.column_family_name == column_family_name);
+ }
+ return result;
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_test_util.h b/src/rocksdb/db/db_test_util.h
new file mode 100644
index 000000000..eeabea9bd
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.h
@@ -0,0 +1,1000 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <fcntl.h>
+#include <cinttypes>
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "memtable/hash_linklist_rep.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/mock_time_env.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace anon {
+class AtomicCounter {
+ public:
+ explicit AtomicCounter(Env* env = NULL)
+ : env_(env), cond_count_(&mu_), count_(0) {}
+
+ void Increment() {
+ MutexLock l(&mu_);
+ count_++;
+ cond_count_.SignalAll();
+ }
+
+ int Read() {
+ MutexLock l(&mu_);
+ return count_;
+ }
+
+ bool WaitFor(int count) {
+ MutexLock l(&mu_);
+
+ uint64_t start = env_->NowMicros();
+ while (count_ < count) {
+ uint64_t now = env_->NowMicros();
+ cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000);
+ if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) {
+ return false;
+ }
+ if (count_ < count) {
+ GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual";
+ }
+ }
+
+ return true;
+ }
+
+ void Reset() {
+ MutexLock l(&mu_);
+ count_ = 0;
+ cond_count_.SignalAll();
+ }
+
+ private:
+ Env* env_;
+ port::Mutex mu_;
+ port::CondVar cond_count_;
+ int count_;
+};
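+
+// Usage sketch (illustrative): SpecialEnv below exposes several of these
+// counters; a test typically enables counting before (re)opening the DB,
+// resets the counter, runs the operation under test and reads it back.
+//
+//   env_->count_random_reads_ = true;
+//   Reopen(CurrentOptions());
+//   env_->random_read_counter_.Reset();
+//   ASSERT_EQ("value", Get("key"));  // hypothetical key/value
+//   int reads = env_->random_read_counter_.Read();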
+
+struct OptionsOverride {
+ std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+ // These will be used only if filter_policy is set
+ bool partition_filters = false;
+ uint64_t metadata_block_size = 1024;
+
+ // Used as a bit mask of SkipPolicy values indicating which XF test points
+ // to skip.
+ int skip_policy = 0;
+};
+
+} // namespace anon
+
+enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 };
+
+// A hacky skip list mem table that triggers flush after a given number of
+// entries.
+class SpecialMemTableRep : public MemTableRep {
+ public:
+ explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable,
+ int num_entries_flush)
+ : MemTableRep(allocator),
+ memtable_(memtable),
+ num_entries_flush_(num_entries_flush),
+ num_entries_(0) {}
+
+ virtual KeyHandle Allocate(const size_t len, char** buf) override {
+ return memtable_->Allocate(len, buf);
+ }
+
+ // Insert key into the list.
+ // REQUIRES: nothing that compares equal to key is currently in the list.
+ virtual void Insert(KeyHandle handle) override {
+ num_entries_++;
+ memtable_->Insert(handle);
+ }
+
+ void InsertConcurrently(KeyHandle handle) override {
+ num_entries_++;
+ memtable_->Insert(handle);
+ }
+
+ // Returns true iff an entry that compares equal to key is in the list.
+ virtual bool Contains(const char* key) const override {
+ return memtable_->Contains(key);
+ }
+
+ virtual size_t ApproximateMemoryUsage() override {
+ // Return a high memory usage when number of entries exceeds the threshold
+ // to trigger a flush.
+ return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024;
+ }
+
+ virtual void Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg,
+ const char* entry)) override {
+ memtable_->Get(k, callback_args, callback_func);
+ }
+
+ uint64_t ApproximateNumEntries(const Slice& start_ikey,
+ const Slice& end_ikey) override {
+ return memtable_->ApproximateNumEntries(start_ikey, end_ikey);
+ }
+
+ virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
+ return memtable_->GetIterator(arena);
+ }
+
+ virtual ~SpecialMemTableRep() override {}
+
+ private:
+ std::unique_ptr<MemTableRep> memtable_;
+ int num_entries_flush_;
+ int num_entries_;
+};
+
+// The factory for the hacky skip list mem table that triggers flush after the
+// number of entries exceeds a threshold.
+class SpecialSkipListFactory : public MemTableRepFactory {
+ public:
+ // After the number of inserts in a mem table exceeds `num_entries_flush`,
+ // trigger a flush.
+ explicit SpecialSkipListFactory(int num_entries_flush)
+ : num_entries_flush_(num_entries_flush) {}
+
+ using MemTableRepFactory::CreateMemTableRep;
+ virtual MemTableRep* CreateMemTableRep(
+ const MemTableRep::KeyComparator& compare, Allocator* allocator,
+ const SliceTransform* transform, Logger* /*logger*/) override {
+ return new SpecialMemTableRep(
+ allocator, factory_.CreateMemTableRep(compare, allocator, transform, 0),
+ num_entries_flush_);
+ }
+ virtual const char* Name() const override { return "SkipListFactory"; }
+
+ bool IsInsertConcurrentlySupported() const override {
+ return factory_.IsInsertConcurrentlySupported();
+ }
+
+ private:
+ SkipListFactory factory_;
+ int num_entries_flush_;
+};
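+
+// Usage sketch (illustrative): installing this factory makes every memtable
+// report a huge ApproximateMemoryUsage() once it holds N entries, so a flush
+// is triggered once a memtable holds N entries.
+//
+//   Options options = CurrentOptions();
+//   options.memtable_factory.reset(new SpecialSkipListFactory(3));
+//   DestroyAndReopen(options);
+//   // every third Put() now produces a new L0 file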
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+ explicit SpecialEnv(Env* base);
+
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ class SSTableFile : public WritableFile {
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+
+ public:
+ SSTableFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& base)
+ : env_(env), base_(std::move(base)) {}
+ Status Append(const Slice& data) override {
+ if (env_->table_write_callback_) {
+ (*env_->table_write_callback_)();
+ }
+ if (env_->drop_writes_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else if (env_->no_space_.load(std::memory_order_acquire)) {
+ return Status::NoSpace("No space left on device");
+ } else {
+ env_->bytes_written_ += data.size();
+ return base_->Append(data);
+ }
+ }
+ Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ if (env_->table_write_callback_) {
+ (*env_->table_write_callback_)();
+ }
+ if (env_->drop_writes_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else if (env_->no_space_.load(std::memory_order_acquire)) {
+ return Status::NoSpace("No space left on device");
+ } else {
+ env_->bytes_written_ += data.size();
+ return base_->PositionedAppend(data, offset);
+ }
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ Status s = base_->RangeSync(offset, nbytes);
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::RangeSync", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ // Check preallocation size
+ // preallocation size is never passed to base file.
+ size_t preallocation_size = preallocation_block_size();
+ TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
+ &preallocation_size);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ Status s = base_->Close();
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Close", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
+ env_->SleepForMicroseconds(100000);
+ }
+ Status s = base_->Sync();
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ void SetIOPriority(Env::IOPriority pri) override {
+ base_->SetIOPriority(pri);
+ }
+ Env::IOPriority GetIOPriority() override {
+ return base_->GetIOPriority();
+ }
+ bool use_direct_io() const override {
+ return base_->use_direct_io();
+ }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+ };
+ class ManifestFile : public WritableFile {
+ public:
+ ManifestFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {}
+ Status Append(const Slice& data) override {
+ if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated writer error");
+ } else {
+ return base_->Append(data);
+ }
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status Close() override { return base_->Close(); }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated sync error");
+ } else {
+ return base_->Sync();
+ }
+ }
+ uint64_t GetFileSize() override { return base_->GetFileSize(); }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+ class WalFile : public WritableFile {
+ public:
+ WalFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {
+ env_->num_open_wal_file_.fetch_add(1);
+ }
+ virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); }
+ Status Append(const Slice& data) override {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1");
+#endif
+ Status s;
+ if (env_->log_write_error_.load(std::memory_order_acquire)) {
+ s = Status::IOError("simulated writer error");
+ } else {
+ int slowdown =
+ env_->log_write_slowdown_.load(std::memory_order_acquire);
+ if (slowdown > 0) {
+ env_->SleepForMicroseconds(slowdown);
+ }
+ s = base_->Append(data);
+ }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2");
+#endif
+ return s;
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ // Check preallocation size
+ // preallocation size is never passed to base file.
+ size_t preallocation_size = preallocation_block_size();
+ TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus",
+ &preallocation_size);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+
+ return base_->Close();
+ }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ return base_->Sync();
+ }
+ bool IsSyncThreadSafe() const override {
+ return env_->is_wal_sync_thread_safe_.load();
+ }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+
+ if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
+ uint32_t random_number;
+ {
+ MutexLock l(&rnd_mutex_);
+ random_number = rnd_.Uniform(100);
+ }
+ if (random_number < non_writeable_rate_.load()) {
+ return Status::IOError("simulated random write error");
+ }
+ }
+
+ new_writable_count_++;
+
+ if (non_writable_count_.load() > 0) {
+ non_writable_count_--;
+ return Status::IOError("simulated write error");
+ }
+
+ EnvOptions optimized = soptions;
+ if (strstr(f.c_str(), "MANIFEST") != nullptr ||
+ strstr(f.c_str(), "log") != nullptr) {
+ optimized.use_mmap_writes = false;
+ optimized.use_direct_writes = false;
+ }
+
+ Status s = target()->NewWritableFile(f, r, optimized);
+ if (s.ok()) {
+ if (strstr(f.c_str(), ".sst") != nullptr) {
+ r->reset(new SSTableFile(this, std::move(*r)));
+ } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+ r->reset(new ManifestFile(this, std::move(*r)));
+ } else if (strstr(f.c_str(), "log") != nullptr) {
+ r->reset(new WalFile(this, std::move(*r)));
+ }
+ }
+ return s;
+ }
+
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public RandomAccessFile {
+ public:
+ CountingFile(std::unique_ptr<RandomAccessFile>&& target,
+ anon::AtomicCounter* counter,
+ std::atomic<size_t>* bytes_read)
+ : target_(std::move(target)),
+ counter_(counter),
+ bytes_read_(bytes_read) {}
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ counter_->Increment();
+ Status s = target_->Read(offset, n, result, scratch);
+ *bytes_read_ += result->size();
+ return s;
+ }
+
+ virtual Status Prefetch(uint64_t offset, size_t n) override {
+ Status s = target_->Prefetch(offset, n);
+ *bytes_read_ += n;
+ return s;
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ anon::AtomicCounter* counter_;
+ std::atomic<size_t>* bytes_read_;
+ };
+
+ Status s = target()->NewRandomAccessFile(f, r, soptions);
+ random_file_open_counter_++;
+ if (s.ok() && count_random_reads_) {
+ r->reset(new CountingFile(std::move(*r), &random_read_counter_,
+ &random_read_bytes_counter_));
+ }
+ if (s.ok() && soptions.compaction_readahead_size > 0) {
+ compaction_readahead_size_ = soptions.compaction_readahead_size;
+ }
+ return s;
+ }
+
+ virtual Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public SequentialFile {
+ public:
+ CountingFile(std::unique_ptr<SequentialFile>&& target,
+ anon::AtomicCounter* counter)
+ : target_(std::move(target)), counter_(counter) {}
+ virtual Status Read(size_t n, Slice* result, char* scratch) override {
+ counter_->Increment();
+ return target_->Read(n, result, scratch);
+ }
+ virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
+
+ private:
+ std::unique_ptr<SequentialFile> target_;
+ anon::AtomicCounter* counter_;
+ };
+
+ Status s = target()->NewSequentialFile(f, r, soptions);
+ if (s.ok() && count_sequential_reads_) {
+ r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+ }
+ return s;
+ }
+
+ virtual void SleepForMicroseconds(int micros) override {
+ sleep_counter_.Increment();
+ if (no_slowdown_ || time_elapse_only_sleep_) {
+ addon_time_.fetch_add(micros);
+ }
+ if (!no_slowdown_) {
+ target()->SleepForMicroseconds(micros);
+ }
+ }
+
+ virtual Status GetCurrentTime(int64_t* unix_time) override {
+ Status s;
+ if (!time_elapse_only_sleep_) {
+ s = target()->GetCurrentTime(unix_time);
+ }
+ if (s.ok()) {
+ *unix_time += addon_time_.load();
+ }
+ return s;
+ }
+
+ virtual uint64_t NowCPUNanos() override {
+ now_cpu_count_.fetch_add(1);
+ return target()->NowCPUNanos();
+ }
+
+ virtual uint64_t NowNanos() override {
+ return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) +
+ addon_time_.load() * 1000;
+ }
+
+ virtual uint64_t NowMicros() override {
+ return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) +
+ addon_time_.load();
+ }
+
+ virtual Status DeleteFile(const std::string& fname) override {
+ delete_count_.fetch_add(1);
+ return target()->DeleteFile(fname);
+ }
+
+ Random rnd_;
+ port::Mutex rnd_mutex_;  // Lock to protect rnd_
+
+ // sstable Sync() calls are blocked while this is true.
+ std::atomic<bool> delay_sstable_sync_;
+
+ // Drop writes on the floor while this is true.
+ std::atomic<bool> drop_writes_;
+
+ // Simulate no-space errors while this is true.
+ std::atomic<bool> no_space_;
+
+ // Simulate a non-writable file system while this is true.
+ std::atomic<bool> non_writable_;
+
+ // Force sync of manifest files to fail while this is true.
+ std::atomic<bool> manifest_sync_error_;
+
+ // Force writes to manifest files to fail while this is true.
+ std::atomic<bool> manifest_write_error_;
+
+ // Force writes to log files to fail while this is true.
+ std::atomic<bool> log_write_error_;
+
+ // Slow down every log write, in micro-seconds.
+ std::atomic<int> log_write_slowdown_;
+
+ // Number of WAL files that are still open for write.
+ std::atomic<int> num_open_wal_file_;
+
+ bool count_random_reads_;
+ anon::AtomicCounter random_read_counter_;
+ std::atomic<size_t> random_read_bytes_counter_;
+ std::atomic<int> random_file_open_counter_;
+
+ bool count_sequential_reads_;
+ anon::AtomicCounter sequential_read_counter_;
+
+ anon::AtomicCounter sleep_counter_;
+
+ std::atomic<int64_t> bytes_written_;
+
+ std::atomic<int> sync_counter_;
+
+ std::atomic<uint32_t> non_writeable_rate_;
+
+ std::atomic<uint32_t> new_writable_count_;
+
+ std::atomic<uint32_t> non_writable_count_;
+
+ std::function<void()>* table_write_callback_;
+
+ std::atomic<int64_t> addon_time_;
+
+ std::atomic<int> now_cpu_count_;
+
+ std::atomic<int> delete_count_;
+
+ std::atomic<bool> time_elapse_only_sleep_;
+
+ bool no_slowdown_;
+
+ std::atomic<bool> is_wal_sync_thread_safe_{true};
+
+ std::atomic<size_t> compaction_readahead_size_{};
+};
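+
+// Usage sketch (illustrative): the atomics above are toggled directly from
+// tests to inject faults around a specific operation, e.g.
+//
+//   env_->drop_writes_.store(true, std::memory_order_release);
+//   // ... trigger the flush/compaction whose output should be dropped ...
+//   env_->drop_writes_.store(false, std::memory_order_release);
+//
+//   env_->non_writable_count_.store(1);  // fail exactly one NewWritableFile()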
+
+#ifndef ROCKSDB_LITE
+class OnFileDeletionListener : public EventListener {
+ public:
+ OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {}
+
+ void SetExpectedFileName(const std::string file_name) {
+ expected_file_name_ = file_name;
+ }
+
+ void VerifyMatchedCount(size_t expected_value) {
+ ASSERT_EQ(matched_count_, expected_value);
+ }
+
+ void OnTableFileDeleted(const TableFileDeletionInfo& info) override {
+ if (expected_file_name_ != "") {
+ ASSERT_EQ(expected_file_name_, info.file_path);
+ expected_file_name_ = "";
+ matched_count_++;
+ }
+ }
+
+ private:
+ size_t matched_count_;
+ std::string expected_file_name_;
+};
+#endif
+
+// A test merge operator that mimics put but fails if one of the merge
+// operands (or the existing value) is "corrupted".
+class TestPutOperator : public MergeOperator {
+ public:
+ virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ if (merge_in.existing_value != nullptr &&
+ *(merge_in.existing_value) == "corrupted") {
+ return false;
+ }
+ for (auto value : merge_in.operand_list) {
+ if (value == "corrupted") {
+ return false;
+ }
+ }
+ merge_out->existing_operand = merge_in.operand_list.back();
+ return true;
+ }
+
+ virtual const char* Name() const override { return "TestPutOperator"; }
+};
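+
+// Usage sketch (illustrative): the operator is installed through
+// Options::merge_operator, after which Merge() behaves like Put() unless a
+// "corrupted" operand or base value is encountered.
+//
+//   Options options = CurrentOptions();
+//   options.merge_operator = std::make_shared<TestPutOperator>();
+//   DestroyAndReopen(options);
+//   ASSERT_OK(Merge("key", "value"));
+//   ASSERT_EQ("value", Get("key"));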
+
+class DBTestBase : public testing::Test {
+ public:
+ // Sequence of option configurations to try
+ enum OptionConfig : int {
+ kDefault = 0,
+ kBlockBasedTableWithPrefixHashIndex = 1,
+ kBlockBasedTableWithWholeKeyHashIndex = 2,
+ kPlainTableFirstBytePrefix = 3,
+ kPlainTableCappedPrefix = 4,
+ kPlainTableCappedPrefixNonMmap = 5,
+ kPlainTableAllBytesPrefix = 6,
+ kVectorRep = 7,
+ kHashLinkList = 8,
+ kMergePut = 9,
+ kFilter = 10,
+ kFullFilterWithNewTableReaderForCompactions = 11,
+ kUncompressed = 12,
+ kNumLevel_3 = 13,
+ kDBLogDir = 14,
+ kWalDirAndMmapReads = 15,
+ kManifestFileSize = 16,
+ kPerfOptions = 17,
+ kHashSkipList = 18,
+ kUniversalCompaction = 19,
+ kUniversalCompactionMultiLevel = 20,
+ kCompressedBlockCache = 21,
+ kInfiniteMaxOpenFiles = 22,
+ kxxHashChecksum = 23,
+ kFIFOCompaction = 24,
+ kOptimizeFiltersForHits = 25,
+ kRowCache = 26,
+ kRecycleLogFiles = 27,
+ kConcurrentSkipList = 28,
+ kPipelinedWrite = 29,
+ kConcurrentWALWrites = 30,
+ kDirectIO,
+ kLevelSubcompactions,
+ kBlockBasedTableWithIndexRestartInterval,
+ kBlockBasedTableWithPartitionedIndex,
+ kBlockBasedTableWithPartitionedIndexFormat4,
+ kPartitionedFilterWithNewTableReaderForCompactions,
+ kUniversalSubcompactions,
+ kxxHash64Checksum,
+ kUnorderedWrite,
+ // This must be the last line
+ kEnd,
+ };
+
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ std::string alternative_db_log_dir_;
+ MockEnv* mem_env_;
+ Env* encrypted_env_;
+ SpecialEnv* env_;
+ std::shared_ptr<Env> env_guard_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+
+ int option_config_;
+ Options last_options_;
+
+ // Skip some options, as they may not be applicable to a specific test.
+ // To add more skip constants, use values 4, 8, 16, etc.
+ enum OptionSkip {
+ kNoSkip = 0,
+ kSkipDeletesFilterFirst = 1,
+ kSkipUniversalCompaction = 2,
+ kSkipMergePut = 4,
+ kSkipPlainTable = 8,
+ kSkipHashIndex = 16,
+ kSkipNoSeekToLast = 32,
+ kSkipFIFOCompaction = 128,
+ kSkipMmapReads = 256,
+ };
+
+ const int kRangeDelSkipConfigs =
+ // Plain tables do not support range deletions.
+ kSkipPlainTable |
+ // MmapReads disables the iterator pinning that RangeDelAggregator
+ // requires.
+ kSkipMmapReads;
+
+ explicit DBTestBase(const std::string path);
+
+ ~DBTestBase();
+
+ static std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+ }
+
+ static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+ }
+
+ static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip);
+
+ // Switch to a fresh database with the next option configuration to
+ // test. Return false if there are no more configurations to test.
+ bool ChangeOptions(int skip_mask = kNoSkip);
+
+ // Switch between different compaction styles.
+ bool ChangeCompactOptions();
+
+ // Switch between different WAL-related options.
+ bool ChangeWalOptions();
+
+ // Switch between different filter policies.
+ // Jump from kDefault to kFilter to kFullFilter.
+ bool ChangeFilterOptions();
+
+ // Switch between different DB options for file ingestion tests.
+ bool ChangeOptionsForFileIngestionTest();
+
+ // Return the current option configuration.
+ Options CurrentOptions(const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ Options CurrentOptions(const Options& default_options,
+ const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ static Options GetDefaultOptions();
+
+ Options GetOptions(int option_config,
+ const Options& default_options = GetDefaultOptions(),
+ const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options);
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options);
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void Reopen(const Options& options);
+
+ void Close();
+
+ void DestroyAndReopen(const Options& options);
+
+ void Destroy(const Options& options, bool delete_cf_paths = false);
+
+ Status ReadOnlyReopen(const Options& options);
+
+ Status TryReopen(const Options& options);
+
+ bool IsDirectIOSupported();
+
+ bool IsMemoryMappedAccessSupported() const;
+
+ Status Flush(int cf = 0);
+
+ Status Flush(const std::vector<int>& cf_ids);
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Merge(const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Merge(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Delete(const std::string& k);
+
+ Status Delete(int cf, const std::string& k);
+
+ Status SingleDelete(const std::string& k);
+
+ Status SingleDelete(int cf, const std::string& k);
+
+ bool SetPreserveDeletesSequenceNumber(SequenceNumber sn);
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr);
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr);
+
+ Status Get(const std::string& k, PinnableSlice* v);
+
+ std::vector<std::string> MultiGet(std::vector<int> cfs,
+ const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool batched);
+
+ std::vector<std::string> MultiGet(const std::vector<std::string>& k,
+ const Snapshot* snapshot = nullptr);
+
+ uint64_t GetNumSnapshots();
+
+ uint64_t GetTimeOldestSnapshots();
+
+ uint64_t GetSequenceOldestSnapshots();
+
+ // Return a string that contains all key,value pairs in order,
+ // formatted like "(k1->v1)(k2->v2)".
+ std::string Contents(int cf = 0);
+
+ std::string AllEntriesFor(const Slice& user_key, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+ int NumSortedRuns(int cf = 0);
+
+ uint64_t TotalSize(int cf = 0);
+
+ uint64_t SizeAtLevel(int level);
+
+ size_t TotalLiveFiles(int cf = 0);
+
+ size_t CountLiveFiles();
+
+ int NumTableFilesAtLevel(int level, int cf = 0);
+
+ double CompressionRatioAtLevel(int level, int cf = 0);
+
+ int TotalTableFiles(int cf = 0, int levels = -1);
+#endif // ROCKSDB_LITE
+
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf = 0);
+
+ size_t CountFiles();
+
+ uint64_t Size(const Slice& start, const Slice& limit, int cf = 0);
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id);
+
+ void Compact(int cf, const Slice& start, const Slice& limit);
+
+ void Compact(const Slice& start, const Slice& limit);
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0);
+
+ // Prevent pushing of new sstables into deeper levels by adding
+ // tables that cover a specified range to all levels.
+ void FillLevels(const std::string& smallest, const std::string& largest,
+ int cf);
+
+ void MoveFilesToLevel(int level, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+ void DumpFileCounts(const char* label);
+#endif // ROCKSDB_LITE
+
+ std::string DumpSSTableList();
+
+ static void GetSstFiles(Env* env, std::string path,
+ std::vector<std::string>* files);
+
+ int GetSstFileCount(std::string path);
+
+ // this will generate non-overlapping files since it keeps increasing key_idx
+ void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false);
+
+ void GenerateNewFile(int cf, Random* rnd, int* key_idx, bool nowait = false);
+
+ static const int kNumKeysByGenerateNewRandomFile;
+ static const int KNumKeysByGenerateNewFile = 100;
+
+ void GenerateNewRandomFile(Random* rnd, bool nowait = false);
+
+ std::string IterStatus(Iterator* iter);
+
+ Options OptionsForLogIterTest();
+
+ std::string DummyString(size_t len, char c = 'a');
+
+ void VerifyIterLast(std::string expected_key, int cf = 0);
+
+ // Used to test InplaceUpdate
+
+ // If the previous value is nullptr, sets newValue to a string of delta's
+ // size (filled with 'c') and returns UPDATED.
+ // Otherwise, shrinks the previous value in place to a string of 'b's that
+ // is one byte shorter than before and returns UPDATED_INPLACE.
+ static UpdateStatus updateInPlaceSmallerSize(char* prevValue,
+ uint32_t* prevSize, Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceLargerSize(char* prevValue,
+ uint32_t* prevSize, Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+ Slice delta, std::string* newValue);
+
+ // Utility method to test InplaceUpdate
+ void validateNumberOfEntries(int numValues, int cf = 0);
+
+ void CopyFile(const std::string& source, const std::string& destination,
+ uint64_t size = 0);
+
+ std::unordered_map<std::string, uint64_t> GetAllSSTFiles(
+ uint64_t* total_size = nullptr);
+
+ std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path);
+
+ void VerifyDBFromMap(
+ std::map<std::string, std::string> true_data,
+ size_t* total_reads_res = nullptr, bool tailing_iter = false,
+ std::map<std::string, Status> status = std::map<std::string, Status>());
+
+ void VerifyDBInternal(
+ std::vector<std::pair<std::string, std::string>> true_data);
+
+#ifndef ROCKSDB_LITE
+ uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
+ std::string column_family_name);
+#endif // ROCKSDB_LITE
+
+ uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+ }
+
+ uint64_t TestGetAndResetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return options.statistics->getAndResetTickerCount(ticker_type);
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_universal_compaction_test.cc b/src/rocksdb/db/db_universal_compaction_test.cc
new file mode 100644
index 000000000..61531ae16
--- /dev/null
+++ b/src/rocksdb/db/db_universal_compaction_test.cc
@@ -0,0 +1,2254 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#if !defined(ROCKSDB_LITE)
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string CompressibleString(Random* rnd, int len) {
+ std::string r;
+ test::CompressibleString(rnd, 0.8, len, &r);
+ return r;
+}
+
+class DBTestUniversalCompactionBase
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ explicit DBTestUniversalCompactionBase(
+ const std::string& path) : DBTestBase(path) {}
+ void SetUp() override {
+ num_levels_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+ int num_levels_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompaction() :
+ DBTestUniversalCompactionBase("/db_universal_compaction_test") {}
+};
+
+class DBTestUniversalCompaction2 : public DBTestBase {
+ public:
+ DBTestUniversalCompaction2() : DBTestBase("/db_universal_compaction_test2") {}
+};
+
+namespace {
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false)
+ : check_context_(check_context) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+ explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ db_test->env_->addon_time_.fetch_add(1000);
+ return true;
+ }
+
+ const char* Name() const override { return "DelayFilter"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+ }
+
+ const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+ DBTestBase* db_test;
+};
+} // namespace
+
+// Make sure we don't trigger a problem if the trigger condition is given
+// as 0, which is invalid.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) {
+ Options options = CurrentOptions();
+
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ // Configure universal compaction to always compact to a single sorted run.
+ options.level0_file_num_compaction_trigger = 0;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_options_universal.min_merge_width = 2;
+ options.compaction_options_universal.max_size_amplification_percent = 0;
+
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ filter->expect_manual_compaction_.store(false);
+ options.compaction_filter_factory.reset(filter);
+
+ DestroyAndReopen(options);
+ ASSERT_EQ(1, db_->GetOptions().level0_file_num_compaction_trigger);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ filter->expect_full_compaction_.store(true);
+
+ for (int num = 0; num < 16; num++) {
+ // Write a 100KB file. It should immediately be compacted into a single file.
+ GenerateNewFile(&rnd, &key_idx);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumSortedRuns(0), 1);
+ }
+ ASSERT_OK(Put(Key(key_idx), ""));
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumSortedRuns(0), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.optimize_filters_for_hits = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.memtable_factory.reset(new SpecialSkipListFactory(3));
+
+ DestroyAndReopen(options);
+
+ // block compaction from happening
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ Put(Key(num * 10), "val");
+ if (num) {
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ Put(Key(30 + num * 10), "val");
+ Put(Key(60 + num * 10), "val");
+ }
+ Put("", "");
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ // Query a set of non-existent keys.
+ for (int i = 5; i < 90; i += 10) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+
+ // Make sure bloom filter is used at least once.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+ // Make sure bloom filter is used for all but the last L0 file when looking
+ // up a non-existent key that's in the range of all L0 files.
+ ASSERT_EQ(Get(Key(35)), "NOT_FOUND");
+ ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1,
+ TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+ prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+ // Unblock compaction and wait for it to happen.
+ sleeping_task_low.WakeUp();
+ dbfull()->TEST_WaitForCompact();
+
+ // The same queries will not trigger bloom filter
+ for (int i = 5; i < 90; i += 10) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+}
+
+ // TODO(kailiu) The tests on UniversalCompaction have some issues:
+ // 1. A lot of magic numbers ("11" or "12").
+ // 2. They make assumptions about the memtable flush conditions, which may
+ //    change from time to time.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
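+ // KeepFilterFactory is a test helper that keeps every key; constructed with
+ // check_context=true, it asserts that each compaction's context (full /
+ // manual) matches the expect_* flags toggled throughout this test.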
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ filter->expect_manual_compaction_.store(false);
+ options.compaction_filter_factory.reset(filter);
+
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ if (num_levels_ > 3) {
+ ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ filter->expect_full_compaction_.store(true);
+ // Stage 1:
+ // Generate a set of files at level 0, but don't trigger level-0
+ // compaction.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 100KB
+ GenerateNewFile(1, &rnd, &key_idx);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Suppose each file flushed from the memtable has size 1. Now we compact
+ // level0_file_num_compaction_trigger (= 4) files and should end up with one
+ // big file of size 4.
+ ASSERT_EQ(NumSortedRuns(1), 1);
+
+ // Stage 2:
+ // Now we have one file at level 0, with size 4. We also have some data in
+ // mem table. Let's continue generating new files at level 0, but don't
+ // trigger level-0 compaction.
+ // First, clean up memtable before inserting new data. This will generate
+ // a level-0 file, with size around 0.4 (according to previously written
+ // data amount).
+ filter->expect_full_compaction_.store(false);
+ ASSERT_OK(Flush(1));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_EQ(NumSortedRuns(1), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+ // After compaction, we should have 2 files, with size 4, 2.4.
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Stage 3:
+ // Now we have 2 files at level 0, with size 4 and 2.4. Continue
+ // generating new files at level 0.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_EQ(NumSortedRuns(1), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
+ // After compaction, we should have 3 files, with size 4, 2.4, 2.
+ ASSERT_EQ(NumSortedRuns(1), 3);
+
+ // Stage 4:
+ // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
+ // new file of size 1.
+ GenerateNewFile(1, &rnd, &key_idx);
+ dbfull()->TEST_WaitForCompact();
+ // Level-0 compaction is triggered, but no file will be picked up.
+ ASSERT_EQ(NumSortedRuns(1), 4);
+
+ // Stage 5:
+ // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
+ // a new file of size 1.
+ filter->expect_full_compaction_.store(true);
+ GenerateNewFile(1, &rnd, &key_idx);
+ dbfull()->TEST_WaitForCompact();
+ // All files at level 0 will be compacted into a single one.
+ ASSERT_EQ(NumSortedRuns(1), 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate two files in Level 0. Both files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Flush whatever is remaining in the memtable. This is typically small,
+ // which should not trigger size-ratio-based compaction but will instead
+ // trigger size-amplification-based compaction.
+ ASSERT_OK(Flush(1));
+
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ // Initial setup of compaction_options_universal will prevent universal
+ // compaction from happening
+ options.compaction_options_universal.size_ratio = 100;
+ options.compaction_options_universal.min_merge_width = 100;
+ DestroyAndReopen(options);
+
+ int total_picked_compactions = 0;
+ int total_size_amp_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ Compaction* c = static_cast<Compaction*>(arg);
+ if (c->compaction_reason() ==
+ CompactionReason::kUniversalSizeAmplification) {
+ total_size_amp_compactions++;
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate two files in Level 0. Both files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Flush whatever is remaining in the memtable. This is typically small,
+ // which should not trigger size-ratio-based compaction but could instead
+ // trigger size-amplification-based compaction if the threshold were set
+ // to 110.
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ // Verify compaction did not happen
+ ASSERT_EQ(NumSortedRuns(1), 3);
+
+ // Trigger compaction if size amplification exceeds 110% without reopening DB
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_size_amplification_percent,
+ 200U);
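+ // SetOptions() applies mutable column-family options without reopening the
+ // DB; the "{...}" syntax assigns fields nested inside
+ // compaction_options_universal.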
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {{"compaction_options_universal",
+ "{max_size_amplification_percent=110;}"}}));
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_size_amplification_percent,
+ 110u);
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal
+ .max_size_amplification_percent);
+
+ dbfull()->TEST_WaitForCompact();
+ // Verify that size amplification did happen
+ ASSERT_EQ(NumSortedRuns(1), 1);
+ ASSERT_EQ(total_picked_compactions, 1);
+ ASSERT_EQ(total_size_amp_compactions, 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ // Initial setup of compaction_options_universal will prevent universal
+ // compaction from happening
+ options.compaction_options_universal.max_size_amplification_percent = 2000;
+ options.compaction_options_universal.size_ratio = 0;
+ options.compaction_options_universal.min_merge_width = 100;
+ DestroyAndReopen(options);
+
+ int total_picked_compactions = 0;
+ int total_size_ratio_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ Compaction* c = static_cast<Compaction*>(arg);
+ if (c->compaction_reason() == CompactionReason::kUniversalSizeRatio) {
+ total_size_ratio_compactions++;
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate three files in Level 0. All files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger);
+
+ // Flush whatever is remaining in the memtable. This is typically small,
+ // about 30KB.
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ // Verify compaction did not happen
+ ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1);
+ ASSERT_EQ(total_picked_compactions, 0);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1],
+ {{"compaction_options_universal",
+ "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}}));
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_merge_width,
+ 2u);
+ ASSERT_EQ(
+ dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio,
+ 100u);
+
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u);
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width,
+ 2u);
+
+ dbfull()->TEST_WaitForCompact();
+
+ // Files in L0 are approx: 0.3 (30KB), 1, 1, 1.
+ // On compaction: the files are below the size amp threshold, so we fall
+ // through to checking the read amp conditions. The configured size ratio is
+ // not big enough to take 0.3 into consideration. So the next two files of
+ // size 1 are compacted together first, as they satisfy the size ratio and
+ // (min_merge_width, max_merge_width) conditions, yielding a file of size 2.
+ // Next, the newly generated 2 and the last file of size 1 are compacted
+ // together. So at the end: #sorted_runs = 2, #picked_compactions = 2, and
+ // all the picked ones are size-ratio-based compactions.
+ ASSERT_EQ(NumSortedRuns(1), 2);
+ // If max_merge_width had not been changed dynamically above, and if it had
+ // remained at its default value of UINT_MAX, total_picked_compactions
+ // would have been 1.
+ ASSERT_EQ(total_picked_compactions, 2);
+ ASSERT_EQ(total_size_ratio_compactions, 2);
+}
+
+TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 10;
+
+ ChangeCompactOptions();
+ Options options;
+ options.create_if_missing = true;
+ options.compaction_style = kCompactionStyleLevel;
+ options.num_levels = 1;
+ options.target_file_size_base = options.write_buffer_size;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+ Random rnd(301);
+ for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ std::vector<std::string> compaction_input_file_names;
+ for (auto file : cf_meta.levels[0].files) {
+ if (rnd.OneIn(2)) {
+ compaction_input_file_names.push_back(file.name);
+ }
+ }
+
+ if (compaction_input_file_names.size() == 0) {
+ compaction_input_file_names.push_back(
+ cf_meta.levels[0].files[0].name);
+ }
+
+ // Expect failure, since universal compaction only allows L0 output.
+ ASSERT_FALSE(dbfull()
+ ->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names, 1)
+ .ok());
+
+ // expect ok and verify the compacted files no longer exist.
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), handles_[1],
+ compaction_input_file_names, 0));
+
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(
+ cf_meta,
+ std::set<std::string>(compaction_input_file_names.begin(),
+ compaction_input_file_names.end()));
+
+ compaction_input_file_names.clear();
+
+ // Pick the first and the last file, expect everything is
+ // compacted into one single file.
+ compaction_input_file_names.push_back(
+ cf_meta.levels[0].files[0].name);
+ compaction_input_file_names.push_back(
+ cf_meta.levels[0].files[
+ cf_meta.levels[0].files.size() - 1].name);
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), handles_[1],
+ compaction_input_file_names, 0));
+
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.num_levels = 7;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ // Generate 3 overlapping files
+ Random rnd(301);
+ for (int i = 0; i < 210; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 200; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 250; i < 260; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("3", FilesPerLevel(0));
+ // Compact all files into 1 file and put it in L4
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 4;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, nullptr, nullptr);
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+}
+
+#ifndef ROCKSDB_VALGRIND_RUN
+class DBTestUniversalCompactionMultiLevels
+ : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompactionMultiLevels() :
+ DBTestUniversalCompactionBase(
+ "/db_universal_compaction_multi_levels_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 8;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 32 * 1024;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 100000;
+ for (int i = 0; i < num_keys * 2; i++) {
+ ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+ }
+
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+}
+
+// Tests universal compaction with trivial move enabled
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ non_trivial_move++;
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 3;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 32 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 150000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(trivial_move, 0);
+ ASSERT_GT(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(MultiLevels, DBTestUniversalCompactionMultiLevels,
+ ::testing::Combine(::testing::Values(3, 20),
+ ::testing::Bool()));
+
+class DBTestUniversalCompactionParallel :
+ public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompactionParallel() :
+ DBTestUniversalCompactionBase(
+ "/db_universal_compaction_prallel_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 1 << 10; // 1KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 3;
+ options.target_file_size_base = 1 * 1024;
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Delay every compaction so multiple compactions will happen.
+ std::atomic<int> num_compactions_running(0);
+ std::atomic<bool> has_parallel(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ if (num_compactions_running.fetch_add(1) > 0) {
+ has_parallel.store(true);
+ return;
+ }
+ for (int nwait = 0; nwait < 20000; nwait++) {
+ if (has_parallel.load() || num_compactions_running.load() > 1) {
+ has_parallel.store(true);
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():End",
+ [&](void* /*arg*/) { num_compactions_running.fetch_add(-1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 30000;
+ for (int i = 0; i < num_keys * 2; i++) {
+ ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(num_compactions_running.load(), 0);
+ ASSERT_TRUE(has_parallel.load());
+
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+
+ // Reopen and check.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+}
+
+TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 1 * 1024; // 1KB
+ options.level0_file_num_compaction_trigger = 7;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 1024 * 1024; // 1MB
+
+ // Disable size-amplification-triggered compaction
+ options.compaction_options_universal.max_size_amplification_percent =
+ UINT_MAX;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0",
+ "BackgroundCallCompaction:0"},
+ {"UniversalCompactionBuilder::PickCompaction:Return",
+ "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"},
+ {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2",
+ "CompactionJob::Run():Start"}});
+
+ int total_picked_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Write 7 files to trigger compaction
+ int key_idx = 1;
+ for (int i = 1; i <= 70; i++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ if (i % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Wait for the 1st background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Write 3 files while 1st compaction is held
+ // These 3 files have different sizes to avoid compacting based on size_ratio
+ int num_keys = 1000;
+ for (int i = 0; i < 3; i++) {
+ for (int j = 1; j <= num_keys; j++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ }
+ ASSERT_OK(Flush());
+ num_keys -= 100;
+ }
+
+ // Hold the 1st compaction from finishing
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+ dbfull()->TEST_WaitForCompact();
+
+ // There should only be one picked compaction as the score drops below one
+ // after the first one is picked.
+ EXPECT_EQ(total_picked_compactions, 1);
+ EXPECT_EQ(TotalTableFiles(), 4);
+
+ // Stop SyncPoint and destroy the DB and reopen it again
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ key_idx = 1;
+ total_picked_compactions = 0;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Write 7 files to trigger compaction
+ for (int i = 1; i <= 70; i++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ if (i % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Wait for the 1st background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Write 8 files while 1st compaction is held
+ // These 8 files have different sizes to avoid compacting based on size_ratio
+ num_keys = 1000;
+ for (int i = 0; i < 8; i++) {
+ for (int j = 1; j <= num_keys; j++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ }
+ ASSERT_OK(Flush());
+ num_keys -= 100;
+ }
+
+ // Wait for the 2nd background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+
+ // Hold the 1st and 2nd compaction from finishing
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+ dbfull()->TEST_WaitForCompact();
+
+ // This time we will trigger one compaction because of the size ratio and
+ // another compaction because the number of files not yet compacted is
+ // greater than 7.
+ EXPECT_GE(total_picked_compactions, 2);
+}
+
+INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel,
+ ::testing::Combine(::testing::Values(1, 10),
+ ::testing::Values(false)));
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10; // 4KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = -1;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+ if (num < options.level0_file_num_compaction_trigger - 1) {
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ }
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10; // 4KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_options_universal.stop_style =
+ kCompactionStopStyleSimilarSize;
+ options.num_levels = num_levels_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Stage 1:
+ // Generate a set of files at level 0, but don't trigger level-0
+ // compaction.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumSortedRuns(), num + 1);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForCompact();
+ // Suppose each file flushed from the memtable has size 1. Now we compact
+ // level0_file_num_compaction_trigger (= 4) files and should end up with one
+ // big file of size 4.
+ ASSERT_EQ(NumSortedRuns(), 1);
+
+ // Stage 2:
+ // Now we have one file at level 0, with size 4. We also have some data in
+ // mem table. Let's continue generating new files at level 0, but don't
+ // trigger level-0 compaction.
+ // First, clean up memtable before inserting new data. This will generate
+ // a level-0 file, with size around 0.4 (according to previously written
+ // data amount).
+ dbfull()->Flush(FlushOptions());
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ // Write 100KB (100 values, each ~1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumSortedRuns(), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForCompact();
+ // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+ // After compaction, we should have 3 files, with size 4, 0.4, 2.
+ ASSERT_EQ(NumSortedRuns(), 3);
+ // Stage 3:
+ // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
+ // more file at level-0, which should trigger level-0 compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForCompact();
+ // Level-0 compaction is triggered, but no file will be picked up.
+ ASSERT_EQ(NumSortedRuns(), 4);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = 70;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // The first compaction (2) is compressed.
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_LT(TotalSize(), 110000U * 2 * 0.9);
+
+ // The second compaction (4) is compressed
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_LT(TotalSize(), 110000 * 4 * 0.9);
+
+ // The third compaction (2 4) is compressed since this time it is
+ // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_LT(TotalSize(), 110000 * 6 * 0.9);
+
+ // When we start the compaction up to (2 4 8), the latest compaction
+ // output is not compressed.
+ for (int num = 0; num < 8; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = 95;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // When we start the compaction up to (2 4 8), the latest compaction
+ // output is still compressed, given the configured
+ // compression_size_percent of 95.
+ for (int num = 0; num < 14; num++) {
+ // Write 120KB (12 values, each 10K)
+ for (int i = 0; i < 12; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2);
+}
+
+#ifndef ROCKSDB_VALGRIND_RUN
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ non_trivial_move++;
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 2;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 1;
+ options.target_file_size_base = 32 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 250000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(trivial_move, 0);
+ ASSERT_GT(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) {
+ int32_t trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 15;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 8;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 64 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 500000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
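+ // The memtable is sized so that each GenerateNewFile() call below should
+ // produce a single ~100KB L0 file.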
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.write_buffer_size = 111 << 10; // 114KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB file triggers a compaction; the ~400K output goes to the
+ // third path (db_paths[2]).
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1,1,4) -> (2, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 2, 4) -> (3, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 4) -> (8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+ // (1, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 1, 8) -> (2, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 2, 8) -> (3, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 8) -> (4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+ // (1, 4, 8) -> (5, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 10;
+ options.write_buffer_size = 111 << 10; // 114KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 300 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 300 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+ CreateColumnFamilies({"one"}, option_vector[1]);
+
+ // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 300 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+ CreateColumnFamilies({"two"}, option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+
+ // Another 110KB file triggers a compaction; the ~400K output goes to the
+ // third configured path (index 2).
+ generate_file();
+ check_sstfilecount(2, 1);
+
+ // (1, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(0, 1);
+
+ // (1,1,4) -> (2, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 2, 4) -> (3, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 3, 4) -> (8)
+ generate_file();
+ check_sstfilecount(3, 1);
+
+ // (1, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(0, 1);
+
+ // (1, 1, 8) -> (2, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(1, 1);
+
+ // (1, 2, 8) -> (3, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 3, 8) -> (4, 8)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(3, 1);
+
+ // (1, 4, 8) -> (5, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(2, 1);
+ check_sstfilecount(0, 0);
+
+ check_getvalues();
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) {
+ std::function<void(int)> verify_func = [&](int num_keys_in_db) {
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= num_keys_in_db; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+ };
+
+ Random rnd(301);
+ int max_key1 = 200;
+ int max_key2 = 600;
+ int max_key3 = 800;
+ const int KNumKeysPerFile = 10;
+
+ // Stage 1: open a DB with universal compaction, num_levels=1
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 200 << 10; // 200KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.memtable_factory.reset(new SpecialSkipListFactory(KNumKeysPerFile));
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key1; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ // Stage 2: reopen with universal compaction, num_levels=4
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ verify_func(max_key1);
+
+ // Insert more keys
+ for (int i = max_key1 + 1; i <= max_key2; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ verify_func(max_key2);
+ // Compaction to non-L0 has happened.
+ ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+
+ // Stage 3: Compact everything back down to L0, then revert to num_levels=1.
+ options.num_levels = 4;
+ options.target_file_size_base = INT_MAX;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Compact all to level 0
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ // Need to restart it once to remove higher level records in manifest.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Final reopen
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Insert more keys
+ for (int i = max_key2 + 1; i <= max_key3; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+ verify_func(max_key3);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.write_buffer_size = 111 << 10; // 114KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB file triggers a compaction; the ~400K output goes to the
+ // second path.
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1,1,4) -> (2, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 2, 4) -> (3, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 4) -> (8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 1, 8) -> (2, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 2, 8) -> (3, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 8) -> (4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 4, 8) -> (5, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) {
+ if (num_levels_ == 1) {
+ // for single-level universal, everything's bottom level so nothing should
+ // be executed in bottom-pri thread pool.
+ return;
+ }
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {// wait for the full compaction to be picked before adding files intended
+ // for the second one.
+ {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+ "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+ // the full (bottom-pri) compaction waits until a partial (low-pri)
+ // compaction has started to verify they can run in parallel.
+ {"DBImpl::BackgroundCompaction:NonTrivial",
+ "DBImpl::BGWorkBottomCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+ // Pass no_wait above because otherwise GenerateNewFile() waits for both
+ // flush and compaction. We don't want to wait for compaction here because
+ // the full compaction is intentionally blocked while more files are flushed.
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ if (i == 0) {
+ TEST_SYNC_POINT(
+ "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // First compaction should output to bottom level. Second should output to L0
+ // since older L0 files pending compaction prevent it from being placed lower.
+ ASSERT_EQ(NumSortedRuns(), 2);
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) {
+ // Regression test for extra compactions scheduled. Once enough compactions
+ // have been scheduled to bring the score below one, we should stop
+ // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily.
+ const int kNumFilesTrigger = 8;
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2;
+ options.compaction_options_universal.max_size_amplification_percent =
+ static_cast<unsigned int>(-1);
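+ // UINT_MAX effectively disables size-amplification-triggered compactions,
+ // leaving only the size-ratio / file-count triggers for this test.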
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ options.num_levels = num_levels_;
+ Reopen(options);
+
+ std::atomic<int> num_compactions_attempted(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { ++num_compactions_attempted; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ dbfull()->TEST_WaitForCompact();
+ // Compacting the first four files was enough to bring the score below one so
+ // there's no need to schedule any more compactions.
+ ASSERT_EQ(1, num_compactions_attempted);
+ ASSERT_EQ(NumSortedRuns(), 5);
+}
+
+TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) {
+ // Regression test for conflict between:
+ // (1) Running CompactFiles including file in the final sorted run; and
+ // (2) Picking universal size-amp-triggered compaction, which always includes
+ // the final sorted run.
+ if (exclusive_manual_compaction_) {
+ return;
+ }
+
+ Options opts = CurrentOptions();
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.compaction_options_universal.max_size_amplification_percent = 50;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compression = kNoCompression;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.max_background_compactions = 2;
+ opts.num_levels = num_levels_;
+ Reopen(opts);
+
+ // make sure compaction jobs can be parallelized
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ Put("key", "val");
+ Flush();
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1);
+ ColumnFamilyMetaData cf_meta;
+ ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily();
+ dbfull()->GetColumnFamilyMetaData(default_cfh, &cf_meta);
+ ASSERT_EQ(1, cf_meta.levels[num_levels_ - 1].files.size());
+ std::string first_sst_filename =
+ cf_meta.levels[num_levels_ - 1].files[0].name;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactFilesImpl:0",
+ "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"},
+ {"DBImpl::BackgroundCompaction():AfterPickCompaction",
+ "CompactFilesImpl:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread compact_files_thread([&]() {
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh,
+ {first_sst_filename}, num_levels_ - 1));
+ });
+
+ TEST_SYNC_POINT(
+ "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0");
+ for (int i = 0; i < 2; ++i) {
+ Put("key", "val");
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ compact_files_thread.join();
+}
+
+INSTANTIATE_TEST_CASE_P(NumLevels, DBTestUniversalCompaction,
+ ::testing::Combine(::testing::Values(1, 3, 5),
+ ::testing::Bool()));
+
+class DBTestUniversalManualCompactionOutputPathId
+ : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalManualCompactionOutputPathId() :
+ DBTestUniversalCompactionBase(
+ "/db_universal_compaction_manual_pid_test") {}
+};
+
+TEST_P(DBTestUniversalManualCompactionOutputPathId,
+ ManualCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.db_paths.emplace_back(dbname_, 1000000000);
+ options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.target_file_size_base = 1 << 30; // Big size
+ options.level0_file_num_compaction_trigger = 10;
+ Destroy(options);
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 1
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ MakeTables(1, "p", "q", 1);
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // Full compaction to DB path 0
+ compact_options.target_path_id = 0;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+ // Fail when compacting to an invalid path ID
+ compact_options.target_path_id = 2;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+ .IsInvalidArgument());
+}
+
+INSTANTIATE_TEST_CASE_P(OutputPathId,
+ DBTestUniversalManualCompactionOutputPathId,
+ ::testing::Combine(::testing::Values(1, 8),
+ ::testing::Bool()));
+
+TEST_F(DBTestUniversalCompaction2, BasicL0toL1) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
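+  // The deletion collector marks an SST for compaction when at least
+  // kNumDelsTrigger deletions fall within any sliding window of kWindowSize
+  // entries, which is what drives the extra compactions in this test.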
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ // MoveFilesToLevel(6);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, SingleLevel) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.num_levels = 1;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(DBTestUniversalCompaction2, MultipleLevels) {
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 4;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 500; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 500; i < 1000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 1000; i < 1500; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 1500; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+ for (i = 1999; i < 2333; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 2333; i < 2666; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 2666; i < 2999; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+
+ for (i = 1900; i < 2100; ++i) {
+ Delete(Key(i));
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(3));
+ ASSERT_EQ(0, NumTableFilesAtLevel(4));
+ ASSERT_EQ(0, NumTableFilesAtLevel(5));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, OverlappingL0) {
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 5;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 2000; i < 3000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 3500; i < 4000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ for (i = 2900; i < 3100; ++i) {
+ Delete(Key(i));
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, IngestBehind) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.allow_ingest_behind = true;
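+  // With allow_ingest_behind, the bottommost level is reserved for files
+  // ingested behind the DB, so compactions are expected to stop at the
+  // second-to-last level (checked at the end of this test).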
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ Put(Key(i), "val");
+ }
+ Flush();
+ // MoveFilesToLevel(6);
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ Delete(Key(i));
+ } else {
+ Put(Key(i), "val");
+ }
+ }
+ Flush();
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(6));
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ options.compaction_filter_factory.reset(filter);
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ KeepFilter df;
+ options.compaction_filter_factory.reset();
+ options.compaction_filter = &df;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 60 * 24 * 60 * 60;
+ options.compaction_filter = nullptr;
+ Reopen(options);
+ ASSERT_EQ(60 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) {
+ Options opts = CurrentOptions();
+ opts.env = env_;
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.max_open_files = -1;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ opts.num_levels = 5;
+ env_->addon_time_.store(0);
+ Reopen(opts);
+
+ int periodic_compactions = 0;
+ int start_level = -1;
+ int output_level = -1;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return",
+ [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(arg != nullptr);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kPeriodicCompaction);
+ start_level = compaction->start_level();
+ output_level = compaction->output_level();
+ periodic_compactions++;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Oldest flushed file exceeds the periodic compaction threshold.
+ ASSERT_OK(Put("foo", "bar"));
+ Flush();
+ ASSERT_EQ(0, periodic_compactions);
+  // Move clock forward so that the flushed file qualifies for periodic
+  // compaction.
+ env_->addon_time_.store(48 * 60 * 60 + 100);
+
+  // Another flush would trigger compaction of the oldest file.
+ ASSERT_OK(Put("foo", "bar2"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(1, periodic_compactions);
+ ASSERT_EQ(0, start_level);
+ ASSERT_EQ(4, output_level);
+
+  // Case 2: Oldest compacted file exceeds the periodic compaction threshold.
+ periodic_compactions = 0;
+  // A flush doesn't trigger a periodic compaction when the threshold is not
+  // hit
+ ASSERT_OK(Put("foo", "bar2"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, periodic_compactions);
+
+ // After periodic compaction threshold hits, a flush will trigger
+ // a compaction
+ ASSERT_OK(Put("foo", "bar2"));
+ env_->addon_time_.fetch_add(48 * 60 * 60 + 100);
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(1, periodic_compactions);
+ ASSERT_EQ(0, start_level);
+ ASSERT_EQ(4, output_level);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc
new file mode 100644
index 000000000..ef81de803
--- /dev/null
+++ b/src/rocksdb/db/db_wal_test.cc
@@ -0,0 +1,1586 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBWALTest : public DBTestBase {
+ public:
+ DBWALTest() : DBTestBase("/db_wal_test") {}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+ uint64_t GetAllocatedFileSize(std::string file_name) {
+ struct stat sbuf;
+ int err = stat(file_name.c_str(), &sbuf);
+ assert(err == 0);
+ return sbuf.st_blocks * 512;
+ }
+#endif
+};
+
+// A SpecialEnv enriched to give more insight about deleted files
+class EnrichedSpecialEnv : public SpecialEnv {
+ public:
+ explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {}
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ InstrumentedMutexLock l(&env_mutex_);
+ if (f == skipped_wal) {
+ deleted_wal_reopened = true;
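+      // If a WAL with a larger number was already deleted while this earlier
+      // one survived, recovery is observing a gap in the log sequence.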
+ if (IsWAL(f) && largetest_deleted_wal.size() != 0 &&
+ f.compare(largetest_deleted_wal) <= 0) {
+ gap_in_wals = true;
+ }
+ }
+ return SpecialEnv::NewSequentialFile(f, r, soptions);
+ }
+ Status DeleteFile(const std::string& fname) override {
+ if (IsWAL(fname)) {
+ deleted_wal_cnt++;
+ InstrumentedMutexLock l(&env_mutex_);
+ // If this is the first WAL, remember its name and skip deleting it. We
+ // remember its name partly because the application might attempt to
+ // delete the file again.
+ if (skipped_wal.size() != 0 && skipped_wal != fname) {
+ if (largetest_deleted_wal.size() == 0 ||
+ largetest_deleted_wal.compare(fname) < 0) {
+ largetest_deleted_wal = fname;
+ }
+ } else {
+ skipped_wal = fname;
+ return Status::OK();
+ }
+ }
+ return SpecialEnv::DeleteFile(fname);
+ }
+ bool IsWAL(const std::string& fname) {
+ // printf("iswal %s\n", fname.c_str());
+ return fname.compare(fname.size() - 3, 3, "log") == 0;
+ }
+
+ InstrumentedMutex env_mutex_;
+ // the wal whose actual delete was skipped by the env
+ std::string skipped_wal = "";
+ // the largest WAL that was requested to be deleted
+ std::string largetest_deleted_wal = "";
+ // number of WALs that were successfully deleted
+ std::atomic<size_t> deleted_wal_cnt = {0};
+ // the WAL whose delete from fs was skipped is reopened during recovery
+ std::atomic<bool> deleted_wal_reopened = {false};
+ // whether a gap in the WALs was detected during recovery
+ std::atomic<bool> gap_in_wals = {false};
+};
+
+class DBWALTestWithEnrichedEnv : public DBTestBase {
+ public:
+ DBWALTestWithEnrichedEnv() : DBTestBase("/db_wal_test") {
+ enriched_env_ = new EnrichedSpecialEnv(env_->target());
+ auto options = CurrentOptions();
+ options.env = enriched_env_;
+ options.allow_2pc = true;
+ Reopen(options);
+ delete env_;
+ // to be deleted by the parent class
+ env_ = enriched_env_;
+ }
+
+ protected:
+ EnrichedSpecialEnv* enriched_env_;
+};
+
+// Test that recovery successfully avoids gaps between the logs. One known
+// scenario that could cause such a gap is the application issuing WAL
+// deletions out of order. For the sake of simplicity in the test, here we
+// create the gap by manipulating the env to skip deletion of the first WAL but
+// not the ones after it.
+TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) {
+ auto options = last_options_;
+ // To cause frequent WAL deletion
+ options.write_buffer_size = 128;
+ Reopen(options);
+
+ WriteOptions writeOpt = WriteOptions();
+ for (int i = 0; i < 128 * 5; i++) {
+ ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+ }
+ FlushOptions fo;
+ fo.wait = true;
+ ASSERT_OK(db_->Flush(fo));
+
+ // some wals are deleted
+ ASSERT_NE(0, enriched_env_->deleted_wal_cnt);
+ // but not the first one
+ ASSERT_NE(0, enriched_env_->skipped_wal.size());
+
+ // Test that the WAL that was not deleted will be skipped during recovery
+ options = last_options_;
+ Reopen(options);
+ ASSERT_FALSE(enriched_env_->deleted_wal_reopened);
+ ASSERT_FALSE(enriched_env_->gap_in_wals);
+}
+
+TEST_F(DBWALTest, WAL) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Both values should be present.
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Again, both values should be present.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RollLog) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ ASSERT_OK(Put(1, "foo", "v4"));
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, SyncWALNotBlockWrite) {
+ Options options = CurrentOptions();
+ options.max_write_buffer_number = 4;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo5", "bar5"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"WritableFileWriter::SyncWithoutFlush:1",
+ "DBWALTest::SyncWALNotBlockWrite:1"},
+ {"DBWALTest::SyncWALNotBlockWrite:2",
+ "WritableFileWriter::SyncWithoutFlush:2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
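+  // The dependencies above park SyncWAL inside SyncWithoutFlush: the test
+  // proceeds only once the sync has started, and the sync finishes only after
+  // the test has done its writes and flush, showing neither blocks on it.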
+
+ ROCKSDB_NAMESPACE::port::Thread thread([&]() { ASSERT_OK(db_->SyncWAL()); });
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:1");
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo3", "bar3"));
+ FlushOptions fo;
+ fo.wait = false;
+ ASSERT_OK(db_->Flush(fo));
+ ASSERT_OK(Put("foo4", "bar4"));
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:2");
+
+ thread.join();
+
+ ASSERT_EQ(Get("foo1"), "bar1");
+ ASSERT_EQ(Get("foo2"), "bar2");
+ ASSERT_EQ(Get("foo3"), "bar3");
+ ASSERT_EQ(Get("foo4"), "bar4");
+ ASSERT_EQ(Get("foo5"), "bar5");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, SyncWALNotWaitWrite) {
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo3", "bar3"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"SpecialEnv::WalFile::Append:1", "DBWALTest::SyncWALNotWaitWrite:1"},
+ {"DBWALTest::SyncWALNotWaitWrite:2", "SpecialEnv::WalFile::Append:2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread(
+ [&]() { ASSERT_OK(Put("foo2", "bar2")); });
+ // Moving this to SyncWAL before the actual fsync
+ // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ ASSERT_OK(db_->SyncWAL());
+ // Moving this to SyncWAL after actual fsync
+ // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ thread.join();
+
+ ASSERT_EQ(Get("foo1"), "bar1");
+ ASSERT_EQ(Get("foo2"), "bar2");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, Recover) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Put(1, "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v4"));
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithTableHandle) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ASSERT_OK(Put(1, "bar", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "big", std::string(100, 'a')));
+
+ options = CurrentOptions();
+ const int kSmallMaxOpenFiles = 13;
+ if (option_config_ == kDBLogDir) {
+ // Use this option to check not preloading files
+ // Set the max open files to be small enough so no preload will
+ // happen.
+ options.max_open_files = kSmallMaxOpenFiles;
+      // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = kSmallMaxOpenFiles;
+ });
+
+ } else if (option_config_ == kWalDirAndMmapReads) {
+ // Use this option to check always loading all files.
+ options.max_open_files = 100;
+ } else {
+ options.max_open_files = -1;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
+ size_t total_files = 0;
+ for (const auto& level : files) {
+ total_files += level.size();
+ }
+ ASSERT_EQ(total_files, 3);
+ for (const auto& level : files) {
+ for (const auto& file : level) {
+ if (options.max_open_files == kSmallMaxOpenFiles) {
+ ASSERT_TRUE(file.table_reader_handle == nullptr);
+ } else {
+ ASSERT_TRUE(file.table_reader_handle != nullptr);
+ }
+ }
+ }
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, IgnoreRecoveredLog) {
+ std::string backup_logs = dbname_ + "/backup_logs";
+
+ do {
+ // delete old files in backup_logs directory
+ env_->CreateDirIfMissing(backup_logs);
+ std::vector<std::string> old_files;
+ env_->GetChildren(backup_logs, &old_files);
+ for (auto& file : old_files) {
+ if (file != "." && file != "..") {
+ env_->DeleteFile(backup_logs + "/" + file);
+ }
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.wal_dir = dbname_ + "/logs";
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ std::string one, two;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
+
+ // copy the logs to backup
+ std::vector<std::string> logs;
+ env_->GetChildren(options.wal_dir, &logs);
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
+ }
+ }
+
+ // recover the DB
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+ Close();
+
+ // copy the logs from backup back to wal dir
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+ }
+ }
+    // This should ignore the log files; recovery should not happen again.
+    // If recovery did happen, the same merge operator would be applied twice,
+    // leading to incorrect results.
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+ Close();
+ Destroy(options);
+ Reopen(options);
+ Close();
+
+ // copy the logs from backup back to wal dir
+ env_->CreateDirIfMissing(options.wal_dir);
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+ }
+ }
+ // assert that we successfully recovered only from logs, even though we
+ // destroyed the DB
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+
+ // Recovery will fail if DB directory doesn't exist.
+ Destroy(options);
+ // copy the logs from backup back to wal dir
+ env_->CreateDirIfMissing(options.wal_dir);
+ for (auto& log : logs) {
+ if (log != ".." && log != ".") {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+        // we won't be needing this file anymore
+ env_->DeleteFile(backup_logs + "/" + log);
+ }
+ }
+ Status s = TryReopen(options);
+ ASSERT_TRUE(!s.ok());
+ Destroy(options);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithEmptyLog) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v3", Get(1, "foo"));
+ } while (ChangeWalOptions());
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBWALTest, PreallocateBlock) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1000 * 1000;
+ options.max_total_wal_size = 0;
+
+ size_t expected_preallocation_size = static_cast<size_t>(
+ options.write_buffer_size + options.write_buffer_size / 10);
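+  // With no WAL size cap configured, the preallocation size is expected to be
+  // write_buffer_size plus a 10% margin; the later cases verify that
+  // max_total_wal_size, db_write_buffer_size and the write buffer manager
+  // each cap it instead.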
+
+ DestroyAndReopen(options);
+
+ std::atomic<int> called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Put("", "");
+ Flush();
+ Put("", "");
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ options.max_total_wal_size = 1000 * 1000;
+ expected_preallocation_size = static_cast<size_t>(options.max_total_wal_size);
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Put("", "");
+ Flush();
+ Put("", "");
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ options.db_write_buffer_size = 800 * 1000;
+ expected_preallocation_size =
+ static_cast<size_t>(options.db_write_buffer_size);
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Put("", "");
+ Flush();
+ Put("", "");
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ expected_preallocation_size = 700 * 1000;
+ std::shared_ptr<WriteBufferManager> write_buffer_manager =
+ std::make_shared<WriteBufferManager>(static_cast<uint64_t>(700 * 1000));
+ options.write_buffer_manager = write_buffer_manager;
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Put("", "");
+ Flush();
+ Put("", "");
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+}
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, FullPurgePreservesRecycledLog) {
+ // For github issue #1303
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.recycle_log_file_num = 2;
+ if (i != 0) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_GT(log_files.size(), 0);
+ ASSERT_OK(Flush());
+
+ // Now the original WAL is in log_files[0] and should be marked for
+ // recycling.
+ // Verify full purge cannot remove this file.
+ JobContext job_context(0);
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&job_context, true /* force */);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->PurgeObsoleteFiles(job_context);
+
+ if (i == 0) {
+ ASSERT_OK(
+ env_->FileExists(LogFileName(dbname_, log_files[0]->LogNumber())));
+ } else {
+ ASSERT_OK(env_->FileExists(
+ LogFileName(alternative_wal_dir_, log_files[0]->LogNumber())));
+ }
+ }
+}
+
+TEST_F(DBWALTest, FullPurgePreservesLogPendingReuse) {
+ // Ensures full purge cannot delete a WAL while it's in the process of being
+ // recycled. In particular, we force the full purge after a file has been
+ // chosen for reuse, but before it has been renamed.
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.recycle_log_file_num = 1;
+ if (i != 0) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+ DestroyAndReopen(options);
+
+ // The first flush creates a second log so writes can continue before the
+ // flush finishes.
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ // The second flush can recycle the first log. Sync points enforce the
+ // full purge happens after choosing the log to recycle and before it is
+ // renamed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::CreateWAL:BeforeReuseWritableFile1",
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"},
+ {"DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge",
+ "DBImpl::CreateWAL:BeforeReuseWritableFile2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread thread([&]() {
+ TEST_SYNC_POINT(
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge");
+ ASSERT_OK(db_->EnableFileDeletions(true));
+ TEST_SYNC_POINT(
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge");
+ });
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ thread.join();
+ }
+}
+
+TEST_F(DBWALTest, GetSortedWalFiles) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_EQ(0, log_files.size());
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_EQ(1, log_files.size());
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, GetCurrentWalFile) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ std::unique_ptr<LogFile>* bad_log_file = nullptr;
+ ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+ std::unique_ptr<LogFile> log_file;
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ // nothing has been written to the log yet
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_EQ(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+    // add some data and verify that the file size actually moves forward
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "foo2", "v2"));
+ ASSERT_OK(Put(0, "foo3", "v3"));
+
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_GT(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+ // force log files to cycle and add some more data, then check if
+ // log number moves forward
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+
+ ASSERT_OK(Put(0, "foo4", "v4"));
+ ASSERT_OK(Put(0, "foo5", "v5"));
+ ASSERT_OK(Put(0, "foo6", "v6"));
+
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_GT(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
+  // Test for a regression where WAL cleanup missed files that don't contain
+  // data for every column family.
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ uint64_t earliest_log_nums[2];
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ if (log_files.size() > 0) {
+ earliest_log_nums[i] = log_files[0]->LogNumber();
+ } else {
+ earliest_log_nums[i] = port::kMaxUint64;
+ }
+ }
+ // Check at least the first WAL was cleaned up during the recovery.
+ ASSERT_LT(earliest_log_nums[0], earliest_log_nums[1]);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithLargeLog) {
+ do {
+ {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+ ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+ ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+ ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ }
+
+    // Make sure that if we re-open with a small write buffer size,
+    // we flush table files in the middle of a large log file.
+ Options options;
+ options.write_buffer_size = 100000;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+ ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+ ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+ ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+ ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+ } while (ChangeWalOptions());
+}
+
+// In https://reviews.facebook.net/D20661 we changed the
+// recovery behavior: previously, for each log file, each column family's
+// memtable was flushed, even if it was empty. Now we try to create the
+// smallest number of table files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 5000000;
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ // Since we will reopen DB with smaller write_buffer_size,
+ // each key will go to new SST file
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Force 'dobrynia' to be flushed and a new WAL file to be created
+ ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check the SST file count
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ }
+ // New WAL file
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+ options.write_buffer_size = 4096;
+ options.arena_block_size = 4096;
+ ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+ options);
+ {
+ // No inserts => default is empty
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(0));
+    // First 4 keys go to separate SSTs + 1 more SST for 2 smaller keys
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(5));
+ // 1 SST for big key + 1 SST for small one
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(2));
+ // 1 SST for all keys
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+}
+
+// In https://reviews.facebook.net/D20661 we changed the
+// recovery behavior: previously, for each log file, each column family's
+// memtable was flushed, even if it was empty. Now we try to create the
+// smallest number of table files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmount) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000;
+ options.arena_block_size = 4 * 1024;
+ options.avoid_flush_during_recovery = false;
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Force the 'nikitich' memtable to be flushed
+ ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+ ASSERT_OK(Put(3, Key(1), DummyString(1)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // 4 memtables are not flushed, 1 SST file
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+  // Memtable for 'nikitich' has been flushed and a new WAL file has been
+  // opened; 4 memtables are still not flushed
+
+ // Write to new WAL file
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+ // Fill up 'nikitich' one more time
+ ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+ // make it flush
+ ASSERT_OK(Put(3, Key(1), DummyString(1)));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // There are still 4 memtables not flushed, and 2 SST files
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+ options);
+ {
+ std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
+    // Check that records for 'default', 'dobrynia' and 'pikachu' from the
+    // first, second and third WALs went to the same SST.
+    // So there are 6 SSTs: three for 'nikitich', one for 'default', one for
+    // 'dobrynia' and one for 'pikachu'
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(3));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(1));
+ }
+}
+
+TEST_F(DBWALTest, SyncMultipleLogs) {
+ const uint64_t kNumBatches = 2;
+ const int kBatchSize = 1000;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ Reopen(options);
+
+ WriteBatch batch;
+ WriteOptions wo;
+ wo.sync = true;
+
+ for (uint64_t b = 0; b < kNumBatches; b++) {
+ batch.Clear();
+ for (int i = 0; i < kBatchSize; i++) {
+ batch.Put(Key(i), DummyString(128));
+ }
+
+ dbfull()->Write(wo, &batch);
+ }
+
+ ASSERT_OK(dbfull()->SyncWAL());
+}
+
+// Github issue 1339. Prior to the fix, we read the sequence id from the first
+// log into a local variable, then kept increasing the variable as we replayed
+// logs, ignoring the actual sequence ids of the records. This is incorrect if
+// some writes come with WAL disabled.
+TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_env.get();
+ options.disable_auto_compactions = true;
+ WriteOptions wal_on, wal_off;
+ wal_on.sync = true;
+ wal_on.disableWAL = false;
+ wal_off.disableWAL = true;
+ CreateAndReopenWithCF({"dummy"}, options);
+ ASSERT_OK(Put(1, "dummy", "d1", wal_on)); // seq id 1
+ ASSERT_OK(Put(1, "dummy", "d2", wal_off));
+ ASSERT_OK(Put(1, "dummy", "d3", wal_off));
+ ASSERT_OK(Put(0, "key", "v4", wal_on)); // seq id 4
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5
+ ASSERT_EQ("v5", Get(0, "key"));
+ dbfull()->FlushWAL(false);
+ // Simulate a crash.
+ fault_env->SetFilesystemActive(false);
+ Close();
+ fault_env->ResetState();
+ ReopenWithColumnFamilies({"default", "dummy"}, options);
+ // Prior to the fix, we may incorrectly recover "v5" with sequence id = 3.
+ ASSERT_EQ("v5", Get(0, "key"));
+  // Destroy DB before destructing fault_env.
+ Destroy(options);
+}
+
+//
+// Test WAL recovery for the various modes available
+//
+class RecoveryTestHelper {
+ public:
+ // Number of WAL files to generate
+ static const int kWALFilesCount = 10;
+ // Starting number for the WAL file name like 00010.log
+ static const int kWALFileOffset = 10;
+ // Keys to be written per WAL file
+ static const int kKeysPerWALFile = 133;
+ // Size of the value
+ static const int kValueSize = 96;
+
+ // Create WAL files with values filled in
+ static void FillData(DBWALTest* test, const Options& options,
+ const size_t wal_count, size_t* count) {
+ // Calling internal functions requires sanitized options.
+ Options sanitized_options = SanitizeOptions(test->dbname_, options);
+ const ImmutableDBOptions db_options(sanitized_options);
+
+ *count = 0;
+
+ std::shared_ptr<Cache> table_cache = NewLRUCache(50, 0);
+ EnvOptions env_options;
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+
+ std::unique_ptr<VersionSet> versions;
+ std::unique_ptr<WalManager> wal_manager;
+ WriteController write_controller;
+
+ versions.reset(new VersionSet(test->dbname_, &db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller,
+ /*block_cache_tracer=*/nullptr));
+
+ wal_manager.reset(new WalManager(db_options, env_options));
+
+ std::unique_ptr<log::Writer> current_log_writer;
+
+ for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) {
+ uint64_t current_log_number = j;
+ std::string fname = LogFileName(test->dbname_, current_log_number);
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), fname, env_options));
+ current_log_writer.reset(
+ new log::Writer(std::move(file_writer), current_log_number,
+ db_options.recycle_log_file_num > 0));
+
+ WriteBatch batch;
+ for (int i = 0; i < kKeysPerWALFile; i++) {
+ std::string key = "key" + ToString((*count)++);
+ std::string value = test->DummyString(kValueSize);
+ assert(current_log_writer.get() != nullptr);
+ uint64_t seq = versions->LastSequence() + 1;
+ batch.Clear();
+ batch.Put(key, value);
+ WriteBatchInternal::SetSequence(&batch, seq);
+ current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch));
+ versions->SetLastAllocatedSequence(seq);
+ versions->SetLastPublishedSequence(seq);
+ versions->SetLastSequence(seq);
+ }
+ }
+ }
+
+ // Recreate and fill the store with some data
+ static size_t FillData(DBWALTest* test, Options* options) {
+ options->create_if_missing = true;
+ test->DestroyAndReopen(*options);
+ test->Close();
+
+ size_t count = 0;
+ FillData(test, *options, kWALFilesCount, &count);
+ return count;
+ }
+
+ // Read back all the keys we wrote and return the number of keys found
+ static size_t GetData(DBWALTest* test) {
+ size_t count = 0;
+ for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) {
+ if (test->Get("key" + ToString(i)) != "NOT_FOUND") {
+ ++count;
+ }
+ }
+ return count;
+ }
+
+  // Manually corrupt the specified WAL
+ static void CorruptWAL(DBWALTest* test, const Options& options,
+ const double off, const double len,
+ const int wal_file_id, const bool trunc = false) {
+ Env* env = options.env;
+ std::string fname = LogFileName(test->dbname_, wal_file_id);
+ uint64_t size;
+ ASSERT_OK(env->GetFileSize(fname, &size));
+ ASSERT_GT(size, 0);
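+    // off and len are fractions of the file size: with trunc the file is
+    // shortened to off * size, otherwise len * size bytes starting at
+    // off * size + 8 are overwritten.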
+#ifdef OS_WIN
+    // Windows disk cache behaves differently. When we truncate,
+    // the original content is still in the cache because the original
+    // handle is still open. Generally, on Windows, one prohibits
+    // shared access to files; it is not needed for the WAL, but we allow
+    // it so that various tests can induce corruption.
+ test->Close();
+#endif
+ if (trunc) {
+ ASSERT_EQ(0, truncate(fname.c_str(), static_cast<int64_t>(size * off)));
+ } else {
+ InduceCorruption(fname, static_cast<size_t>(size * off + 8),
+ static_cast<size_t>(size * len));
+ }
+ }
+
+  // Overwrite data with 'b' from offset for length len
+ static void InduceCorruption(const std::string& filename, size_t offset,
+ size_t len) {
+ ASSERT_GT(len, 0U);
+
+ int fd = open(filename.c_str(), O_RDWR);
+
+    // On Windows, long is 32-bit
+ ASSERT_LE(offset, std::numeric_limits<long>::max());
+
+ ASSERT_GT(fd, 0);
+ ASSERT_EQ(offset, lseek(fd, static_cast<long>(offset), SEEK_SET));
+
+ void* buf = alloca(len);
+ memset(buf, 'b', len);
+ ASSERT_EQ(len, write(fd, buf, static_cast<unsigned int>(len)));
+
+ close(fd);
+ }
+};
+
+// Test scope:
+// - We expect to open the data store when there are incomplete trailing writes
+//   at the end of any of the logs
+// - We do not expect to open the data store in the presence of corruption
+TEST_F(DBWALTest, kTolerateCorruptedTailRecords) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+ for (auto trunc : {true, false}) { /* Corruption style */
+ for (int i = 0; i < 3; i++) { /* Corruption offset position */
+ for (int j = jstart; j < jend; j++) { /* WAL file */
+ // Fill data for testing
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+        // test checksum failures or parsing errors
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, /*wal=*/j, trunc);
+
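+        // A truncated tail looks like an incomplete trailing write, which
+        // this mode tolerates; overwritten bytes mid-file look like real
+        // corruption and should fail the open.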
+ if (trunc) {
+ options.wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_TRUE(i == 0 || recovered_row_count > 0);
+ ASSERT_LT(recovered_row_count, row_count);
+ } else {
+ options.wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords;
+ ASSERT_NOK(TryReopen(options));
+ }
+ }
+ }
+ }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle or trailing -- incomplete writes or corruption)
+TEST_F(DBWALTest, kAbsoluteConsistency) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+ // Verify clean slate behavior
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+ for (auto trunc : {true, false}) { /* Corruption style */
+ for (int i = 0; i < 4; i++) { /* Corruption offset position */
+ if (trunc && i == 0) {
+ continue;
+ }
+
+ for (int j = jstart; j < jend; j++) { /* wal files */
+        // fill with new data
+ RecoveryTestHelper::FillData(this, &options);
+ // corrupt the wal
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, j, trunc);
+ // verify
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+ options.create_if_missing = false;
+ ASSERT_NOK(TryReopen(options));
+ }
+ }
+ }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any inconsistency
+// between WAL and SST files
+TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+
+ // Create DB with multiple column families.
+ CreateAndReopenWithCF({"one", "two"}, options);
+ ASSERT_OK(Put(1, "key1", "val1"));
+ ASSERT_OK(Put(2, "key2", "val2"));
+
+ // Record the offset at this point
+ Env* env = options.env;
+ uint64_t wal_file_id = dbfull()->TEST_LogfileNumber();
+ std::string fname = LogFileName(dbname_, wal_file_id);
+ uint64_t offset_to_corrupt;
+ ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
+ ASSERT_GT(offset_to_corrupt, 0);
+
+ ASSERT_OK(Put(1, "key3", "val3"));
+ // Corrupt WAL at location of key3
+ RecoveryTestHelper::InduceCorruption(
+ fname, static_cast<size_t>(offset_to_corrupt), static_cast<size_t>(4));
+ ASSERT_OK(Put(2, "key4", "val4"));
+ ASSERT_OK(Put(1, "key5", "val5"));
+ Flush(2);
+
+ // PIT recovery & verify
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
+}
+
+// Test scope:
+// - We expect to open data store under all circumstances
+// - We expect only data up to the point where the first error was encountered
+TEST_F(DBWALTest, kPointInTimeRecovery) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+ const int maxkeys =
+ RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile;
+
+ for (auto trunc : {true, false}) { /* Corruption style */
+ for (int i = 0; i < 4; i++) { /* Offset of corruption */
+ for (int j = jstart; j < jend; j++) { /* WAL file */
+ // Fill data for testing
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+ // Corrupt the wal
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, j, trunc);
+
+ // Verify
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+
+ // Probe data for invariants
+ size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_LT(recovered_row_count, row_count);
+
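+        // Recovery should yield a contiguous prefix of the keys: once one key
+        // is missing, no later key may be present.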
+ bool expect_data = true;
+ for (size_t k = 0; k < maxkeys; ++k) {
+          bool found = Get("key" + ToString(k)) != "NOT_FOUND";
+ if (expect_data && !found) {
+ expect_data = false;
+ }
+ ASSERT_EQ(found, expect_data);
+ }
+
+ const size_t min = RecoveryTestHelper::kKeysPerWALFile *
+ (j - RecoveryTestHelper::kWALFileOffset);
+ ASSERT_GE(recovered_row_count, min);
+ if (!trunc && i != 0) {
+ const size_t max = RecoveryTestHelper::kKeysPerWALFile *
+ (j - RecoveryTestHelper::kWALFileOffset + 1);
+ ASSERT_LE(recovered_row_count, max);
+ }
+ }
+ }
+ }
+}
+
+// Test scope:
+// - We expect to open the data store under all scenarios
+// - We expect to have recovered records past the corruption zone
+TEST_F(DBWALTest, kSkipAnyCorruptedRecords) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+ for (auto trunc : {true, false}) { /* Corruption style */
+ for (int i = 0; i < 4; i++) { /* Corruption offset */
+ for (int j = jstart; j < jend; j++) { /* wal files */
+ // Fill data for testing
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+ // Corrupt the WAL
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, j, trunc);
+
+ // Verify behavior
+ options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+
+ // Probe data for invariants
+ size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_LT(recovered_row_count, row_count);
+
+ if (!trunc) {
+ ASSERT_TRUE(i != 0 || recovered_row_count > 0);
+ }
+ }
+ }
+ }
+}
+
+TEST_F(DBWALTest, AvoidFlushDuringRecovery) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = false;
+
+ // Test with flush after recovery.
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Put("bar", "v4"));
+ ASSERT_EQ(1, TotalTableFiles());
+  // Reopen DB. Check that WAL logs were flushed.
+ Reopen(options);
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v4", Get("bar"));
+ ASSERT_EQ(2, TotalTableFiles());
+
+ // Test without flush after recovery.
+ options.avoid_flush_during_recovery = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v5"));
+ ASSERT_OK(Put("bar", "v6"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v7"));
+ ASSERT_OK(Put("bar", "v8"));
+ ASSERT_EQ(1, TotalTableFiles());
+ // Reopen DB. WAL logs should not be flushed this time.
+ Reopen(options);
+ ASSERT_EQ("v7", Get("foo"));
+ ASSERT_EQ("v8", Get("bar"));
+ ASSERT_EQ(1, TotalTableFiles());
+
+ // Force flush with allow_2pc.
+ options.avoid_flush_during_recovery = true;
+ options.allow_2pc = true;
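+  // With 2PC enabled, recovery is expected to flush even though
+  // avoid_flush_during_recovery is set, hence the extra table file below.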
+ ASSERT_OK(Put("foo", "v9"));
+ ASSERT_OK(Put("bar", "v10"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v11"));
+ ASSERT_OK(Put("bar", "v12"));
+ Reopen(options);
+ ASSERT_EQ("v11", Get("foo"));
+ ASSERT_EQ("v12", Get("bar"));
+ ASSERT_EQ(3, TotalTableFiles());
+}
+
+TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) {
+ // Verifies WAL files that were present during recovery, but not flushed due
+ // to avoid_flush_during_recovery, will be considered for deletion at a later
+ // stage. We check at least one such file is deleted during Flush().
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = true;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ Reopen(options);
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ // Flush() triggers deletion of obsolete tracked files
+ Flush();
+ }
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ if (i == 0) {
+ ASSERT_GT(log_files.size(), 0);
+ } else {
+ ASSERT_EQ(0, log_files.size());
+ }
+ }
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlush) {
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 64 * 1024 * 1024;
+
+ size_t count = RecoveryTestHelper::FillData(this, &options);
+ auto validateData = [this, count]() {
+ for (size_t i = 0; i < count; i++) {
+ ASSERT_NE(Get("key" + ToString(i)), "NOT_FOUND");
+ }
+ };
+ Reopen(options);
+ validateData();
+ // Insert some data without flush
+ ASSERT_OK(Put("foo", "foo_v1"));
+ ASSERT_OK(Put("bar", "bar_v1"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v1");
+ ASSERT_EQ(Get("bar"), "bar_v1");
+ // Insert again and reopen
+ ASSERT_OK(Put("foo", "foo_v2"));
+ ASSERT_OK(Put("bar", "bar_v2"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v2");
+ ASSERT_EQ(Get("bar"), "bar_v2");
+ // manual flush and insert again
+ Flush();
+ ASSERT_EQ(Get("foo"), "foo_v2");
+ ASSERT_EQ(Get("bar"), "bar_v2");
+ ASSERT_OK(Put("foo", "foo_v3"));
+ ASSERT_OK(Put("bar", "bar_v3"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v3");
+ ASSERT_EQ(Get("bar"), "bar_v3");
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) {
+ const std::string kSmallValue = "v";
+ const std::string kLargeValue = DummyString(1024);
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ auto countWalFiles = [this]() {
+ VectorLogPtr log_files;
+ dbfull()->GetSortedWalFiles(log_files);
+ return log_files.size();
+ };
+
+ // Create DB with multiple column families and multiple log files.
+ CreateAndReopenWithCF({"one", "two"}, options);
+ ASSERT_OK(Put(0, "key1", kSmallValue));
+ ASSERT_OK(Put(1, "key2", kLargeValue));
+ Flush(1);
+ ASSERT_EQ(1, countWalFiles());
+ ASSERT_OK(Put(0, "key3", kSmallValue));
+ ASSERT_OK(Put(2, "key4", kLargeValue));
+ Flush(2);
+ ASSERT_EQ(2, countWalFiles());
+
+ // Reopen, insert and flush.
+ options.db_write_buffer_size = 64 * 1024 * 1024;
+ ReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_EQ(Get(0, "key1"), kSmallValue);
+ ASSERT_EQ(Get(1, "key2"), kLargeValue);
+ ASSERT_EQ(Get(0, "key3"), kSmallValue);
+ ASSERT_EQ(Get(2, "key4"), kLargeValue);
+ // Insert more data.
+ ASSERT_OK(Put(0, "key5", kLargeValue));
+ ASSERT_OK(Put(1, "key6", kLargeValue));
+ ASSERT_EQ(3, countWalFiles());
+ Flush(1);
+ ASSERT_OK(Put(2, "key7", kLargeValue));
+ dbfull()->FlushWAL(false);
+ ASSERT_EQ(4, countWalFiles());
+
+ // Reopen twice and validate.
+ for (int i = 0; i < 2; i++) {
+ ReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_EQ(Get(0, "key1"), kSmallValue);
+ ASSERT_EQ(Get(1, "key2"), kLargeValue);
+ ASSERT_EQ(Get(0, "key3"), kSmallValue);
+ ASSERT_EQ(Get(2, "key4"), kLargeValue);
+ ASSERT_EQ(Get(0, "key5"), kLargeValue);
+ ASSERT_EQ(Get(1, "key6"), kLargeValue);
+ ASSERT_EQ(Get(2, "key7"), kLargeValue);
+ ASSERT_EQ(4, countWalFiles());
+ }
+}
+
+// In this test we are trying to do the following:
+// 1. Create a DB with corrupted WAL log;
+// 2. Open with avoid_flush_during_recovery = true;
+// 3. Append more data without flushing, which creates new WAL log.
+// 4. Open again. See if it can correctly handle previous corruption.
+TEST_F(DBWALTest, RecoverFromCorruptedWALWithoutFlush) {
+ const int jstart = RecoveryTestHelper::kWALFileOffset;
+ const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+ const int kAppendKeys = 100;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 64 * 1024 * 1024;
+
+ auto getAll = [this]() {
+ std::vector<std::pair<std::string, std::string>> data;
+ ReadOptions ropt;
+ Iterator* iter = dbfull()->NewIterator(ropt);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ data.push_back(
+ std::make_pair(iter->key().ToString(), iter->value().ToString()));
+ }
+ delete iter;
+ return data;
+ };
+ for (auto& mode : wal_recovery_mode_string_map) {
+ options.wal_recovery_mode = mode.second;
+ for (auto trunc : {true, false}) {
+ for (int i = 0; i < 4; i++) {
+ for (int j = jstart; j < jend; j++) {
+ // Create corrupted WAL
+ RecoveryTestHelper::FillData(this, &options);
+ RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+ /*len%=*/.1, /*wal=*/j, trunc);
+ // Skip the test if DB won't open.
+ if (!TryReopen(options).ok()) {
+ ASSERT_TRUE(options.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency ||
+ (!trunc &&
+ options.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords));
+ continue;
+ }
+ ASSERT_OK(TryReopen(options));
+ // Append some more data.
+ for (int k = 0; k < kAppendKeys; k++) {
+ std::string key = "extra_key" + ToString(k);
+ std::string value = DummyString(RecoveryTestHelper::kValueSize);
+ ASSERT_OK(Put(key, value));
+ }
+ // Save data for comparison.
+ auto data = getAll();
+ // Reopen. Verify data.
+ ASSERT_OK(TryReopen(options));
+ auto actual_data = getAll();
+ ASSERT_EQ(data, actual_data);
+ }
+ }
+ }
+ }
+}
+
+// Tests that total log size is recovered if we set
+// avoid_flush_during_recovery=true.
+// Flush should trigger if max_total_wal_size is reached.
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) {
+ class TestFlushListener : public EventListener {
+ public:
+ std::atomic<int> count{0};
+
+ TestFlushListener() = default;
+
+ void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
+ count++;
+ assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason);
+ }
+ };
+ std::shared_ptr<TestFlushListener> test_listener =
+ std::make_shared<TestFlushListener>();
+
+ constexpr size_t kKB = 1024;
+ constexpr size_t kMB = 1024 * 1024;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.max_total_wal_size = 1 * kMB;
+ options.listeners.push_back(test_listener);
+ // Have to open DB in multi-CF mode to trigger flush when
+ // max_total_wal_size is reached.
+ CreateAndReopenWithCF({"one"}, options);
+ // Write some keys and we will end up with one log file which is slightly
+ // smaller than 1MB.
+ std::string value_100k(100 * kKB, 'v');
+ std::string value_300k(300 * kKB, 'v');
+ ASSERT_OK(Put(0, "foo", "v1"));
+ for (int i = 0; i < 9; i++) {
+ ASSERT_OK(Put(1, "key" + ToString(i), value_100k));
+ }
+ // Get log files before reopen.
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ uint64_t log_size_before = log_files_before[0]->SizeFileBytes();
+ ASSERT_GT(log_size_before, 900 * kKB);
+ ASSERT_LT(log_size_before, 1 * kMB);
+ ReopenWithColumnFamilies({"default", "one"}, options);
+ // Write one more value to make log larger than 1MB.
+ ASSERT_OK(Put(1, "bar", value_300k));
+ // Get log files again. A new log file will be opened.
+ VectorLogPtr log_files_after_reopen;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen));
+ ASSERT_EQ(2, log_files_after_reopen.size());
+ ASSERT_EQ(log_files_before[0]->LogNumber(),
+ log_files_after_reopen[0]->LogNumber());
+ ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() +
+ log_files_after_reopen[1]->SizeFileBytes(),
+ 1 * kMB);
+ // Write one more key to trigger flush.
+ ASSERT_OK(Put(0, "foo", "v2"));
+ dbfull()->TEST_WaitForFlushMemTable();
+ // Flushed two column families.
+ ASSERT_EQ(2, test_listener->count.load());
+}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+// Tests that we truncate the preallocated space of the last log file
+// from the previous DB session.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) {
+ constexpr size_t kKB = 1024;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ auto& file_before = log_files_before[0];
+ ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+ // The log file has preallocated space.
+ ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+ Reopen(options);
+ VectorLogPtr log_files_after;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+ ASSERT_EQ(1, log_files_after.size());
+ ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+ // The preallocated space should be truncated.
+ ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+}
+#endif // ROCKSDB_FALLOCATE_PRESENT
+#endif // ROCKSDB_PLATFORM_POSIX
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBWALTest, WalTermTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+
+ WriteBatch batch;
+ batch.Put("foo", "bar");
+ batch.MarkWalTerminationPoint();
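+ // Entries added after MarkWalTerminationPoint() are not persisted to the
+ // WAL, so "foo2" is expected to be absent after the reopen below while
+ // "foo" survives.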
+ batch.Put("foo2", "bar2");
+
+ ASSERT_OK(dbfull()->Write(wo, &batch));
+
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+ ASSERT_EQ("bar", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo2"));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_write_test.cc b/src/rocksdb/db/db_write_test.cc
new file mode 100644
index 000000000..cc1aaac08
--- /dev/null
+++ b/src/rocksdb/db/db_write_test.cc
@@ -0,0 +1,329 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <memory>
+#include <thread>
+#include <vector>
+#include <fstream>
+#include "db/db_test_util.h"
+#include "db/write_batch_internal.h"
+#include "db/write_thread.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test variations of WriteImpl.
+class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
+ public:
+ DBWriteTest() : DBTestBase("/db_write_test") {}
+
+ Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
+
+ void Open() { DBTestBase::Reopen(GetOptions()); }
+};
+
+// It is invalid to do a sync write while the WAL is disabled.
+TEST_P(DBWriteTest, SyncAndDisableWAL) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = true;
+ ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument());
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "bar"));
+ ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument());
+}
+
+TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
+ Options options = GetOptions();
+ options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4;
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+ port::Mutex mutex;
+ port::CondVar cv(&mutex);
+
+ Reopen(options);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ dbfull()->Put(wo, key, "bar");
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ dbfull()->Put(wo, key, "bar");
+ };
+ std::function<void(void *)> unblock_main_thread_func = [&](void *) {
+ mutex.Lock();
+ cv.SignalAll();
+ mutex.Unlock();
+ };
+
+ // Create 3 L0 files and schedule 4th without waiting
+ Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
+ Flush();
+ Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
+ Flush();
+ Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
+ Flush();
+ Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteTest::WriteThreadHangOnWriteStall:1",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBWriteTest::WriteThreadHangOnWriteStall:2",
+ "DBImpl::WriteImpl:BeforeLeaderEnters"},
+ // Make compaction start wait for the write stall to be detected and
+ // put into effect by a write group leader
+ {"DBWriteTest::WriteThreadHangOnWriteStall:3",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Schedule creation of 4th L0 file without waiting. This will seal the
+ // memtable and then wait for a sync point before writing the file. We need
+ // to do it this way because SwitchMemtable() needs to enter the
+ // write_thread
+ FlushOptions fopt;
+ fopt.wait = false;
+ dbfull()->Flush(fopt);
+
+ // Create a mix of slowdown/no_slowdown write threads
+ mutex.Lock();
+ // First leader
+ threads.emplace_back(write_slowdown_func);
+ cv.Wait();
+ // Second leader. Will stall writes
+ threads.emplace_back(write_slowdown_func);
+ cv.Wait();
+ threads.emplace_back(write_no_slowdown_func);
+ cv.Wait();
+ threads.emplace_back(write_slowdown_func);
+ cv.Wait();
+ threads.emplace_back(write_no_slowdown_func);
+ cv.Wait();
+ threads.emplace_back(write_slowdown_func);
+ cv.Wait();
+ mutex.Unlock();
+
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1");
+ dbfull()->TEST_WaitForFlushMemTable(nullptr);
+ // This would have triggered a write stall. Unblock the write group leader
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2");
+ // The leader is going to create missing newer links. When the leader
+ // finishes, the next leader is going to delay writes and fail writers that
+ // set no_slowdown.
+
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3");
+ for (auto& t : threads) {
+ t.join();
+ }
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
+ constexpr int kNumThreads = 5;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ Reopen(options);
+ std::atomic<int> ready_count{0};
+ std::atomic<int> leader_count{0};
+ std::vector<port::Thread> threads;
+ mock_env->SetFilesystemActive(false);
+
+ // Wait until all threads are linked into the write queue, to make sure
+ // all threads join the same batch group.
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ ready_count++;
+ auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ if (w->state == WriteThread::STATE_GROUP_LEADER) {
+ leader_count++;
+ while (ready_count < kNumThreads) {
+ // busy waiting
+ }
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ for (int i = 0; i < kNumThreads; i++) {
+ threads.push_back(port::Thread(
+ [&](int index) {
+ // All threads should fail.
+ auto res = Put("key" + ToString(index), "value");
+ if (options.manual_wal_flush) {
+ ASSERT_TRUE(res.ok());
+ // we should see fs error when we do the flush
+
+ // TSAN reports a false alarm for lock-order-inversion but Open and
+ // FlushWAL are not run concurrently. Disabling this until TSAN is
+ // fixed.
+ // res = dbfull()->FlushWAL(false);
+ // ASSERT_FALSE(res.ok());
+ } else {
+ ASSERT_FALSE(res.ok());
+ }
+ },
+ i));
+ }
+ for (int i = 0; i < kNumThreads; i++) {
+ threads[i].join();
+ }
+ ASSERT_EQ(1, leader_count);
+ // Close before mock_env destruct.
+ Close();
+}
+
+TEST_P(DBWriteTest, ManualWalFlushInEffect) {
+ Options options = GetOptions();
+ Reopen(options);
+ // try the 1st WAL created during open
+ ASSERT_TRUE(Put("key" + ToString(0), "value").ok());
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+ ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
+ // try the 2nd WAL created during SwitchWAL
+ dbfull()->TEST_SwitchWAL();
+ ASSERT_TRUE(Put("key" + ToString(0), "value").ok());
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+ ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ Reopen(options);
+ for (int i = 0; i < 2; i++) {
+ // Forcibly fail WAL write for the first Put only. Subsequent Puts should
+ // fail due to read-only mode
+ mock_env->SetFilesystemActive(i != 0);
+ auto res = Put("key" + ToString(i), "value");
+ // TSAN reports a false alarm for lock-order-inversion but Open and
+ // FlushWAL are not run concurrently. Disabling this until TSAN is
+ // fixed.
+ /*
+ if (options.manual_wal_flush && i == 0) {
+ // even with manual_wal_flush the 2nd Put should return error because of
+ // the read-only mode
+ ASSERT_TRUE(res.ok());
+ // we should see fs error when we do the flush
+ res = dbfull()->FlushWAL(false);
+ }
+ */
+ if (!options.manual_wal_flush) {
+ ASSERT_FALSE(res.ok());
+ }
+ }
+ // Close before mock_env destruct.
+ Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) {
+ Random rnd(301);
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ options.writable_file_max_buffer_size = 4 * 1024 * 1024;
+ options.write_buffer_size = 3 * 512 * 1024;
+ options.wal_bytes_per_sync = 256 * 1024;
+ options.manual_wal_flush = true;
+ Reopen(options);
+ mock_env->SetFilesystemActive(false, Status::IOError("Not active"));
+ Status s;
+ for (int i = 0; i < 4 * 512; ++i) {
+ s = Put(Key(i), RandomString(&rnd, 1024));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
+
+ mock_env->SetFilesystemActive(true);
+ // Close before mock_env destruct.
+ Close();
+}
+
+// Test that db->LockWAL() flushes the WAL after locking.
+TEST_P(DBWriteTest, LockWalInEffect) {
+ Options options = GetOptions();
+ Reopen(options);
+ // try the 1st WAL created during open
+ ASSERT_OK(Put("key" + ToString(0), "value"));
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+ ASSERT_OK(dbfull()->LockWAL());
+ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false));
+ ASSERT_OK(dbfull()->UnlockWAL());
+ // try the 2nd WAL created during SwitchWAL
+ dbfull()->TEST_SwitchWAL();
+ ASSERT_OK(Put("key" + ToString(0), "value"));
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+ ASSERT_OK(dbfull()->LockWAL());
+ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false));
+ ASSERT_OK(dbfull()->UnlockWAL());
+}
+
+TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) {
+ Options options = GetOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ Reopen(options);
+ std::string wal_key_prefix = "WAL_KEY_";
+ std::string no_wal_key_prefix = "K_";
+ // 100 KB value each for NO-WAL operation
+ std::string no_wal_value(1024 * 100, 'X');
+ // 1B value each for WAL operation
+ std::string wal_value = "0";
+ std::thread threads[10];
+ for (int t = 0; t < 10; t++) {
+ threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix, no_wal_value, this] {
+ for(int i = 0; i < 10; i++) {
+ ROCKSDB_NAMESPACE::WriteOptions write_option_disable;
+ write_option_disable.disableWAL = true;
+ ROCKSDB_NAMESPACE::WriteOptions write_option_default;
+ std::string no_wal_key = no_wal_key_prefix + std::to_string(t) +
+ "_" + std::to_string(i);
+ this->Put(no_wal_key, no_wal_value, write_option_disable);
+ std::string wal_key =
+ wal_key_prefix + std::to_string(i) + "_" + std::to_string(i);
+ this->Put(wal_key, wal_value, write_option_default);
+ dbfull()->SyncWAL();
+ }
+ return 0;
+ });
+ }
+ for (auto& t: threads) {
+ t.join();
+ }
+ uint64_t bytes_num = options.statistics->getTickerCount(
+ ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES);
+ // The written WAL size should be less than 100KB (even including header and
+ // footer overhead).
+ ASSERT_LE(bytes_num, 1024 * 100);
+}
+
+INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
+ testing::Values(DBTestBase::kDefault,
+ DBTestBase::kConcurrentWALWrites,
+ DBTestBase::kPipelinedWrite));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
new file mode 100644
index 000000000..e10af2b85
--- /dev/null
+++ b/src/rocksdb/db/dbformat.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h"
+
+#include <stdio.h>
+#include <cinttypes>
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+const ValueType kValueTypeForSeek = kTypeBlobIndex;
+const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
+
+uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+ assert(seq <= kMaxSequenceNumber);
+ assert(IsExtendedValueType(t));
+ return (seq << 8) | t;
+}
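+// For illustration: the packed value keeps the sequence number in the upper
+// 56 bits and the ValueType in the low 8 bits, e.g.
+//   PackSequenceAndType(5, kTypeValue /* 0x1 */) == (5 << 8) | 0x1 == 0x501.
+// UnPackSequenceAndType() below reverses this with packed >> 8 and
+// packed & 0xff.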
+
+EntryType GetEntryType(ValueType value_type) {
+ switch (value_type) {
+ case kTypeValue:
+ return kEntryPut;
+ case kTypeDeletion:
+ return kEntryDelete;
+ case kTypeSingleDeletion:
+ return kEntrySingleDelete;
+ case kTypeMerge:
+ return kEntryMerge;
+ case kTypeRangeDeletion:
+ return kEntryRangeDeletion;
+ case kTypeBlobIndex:
+ return kEntryBlobIndex;
+ default:
+ return kEntryOther;
+ }
+}
+
+bool ParseFullKey(const Slice& internal_key, FullKey* fkey) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(internal_key, &ikey)) {
+ return false;
+ }
+ fkey->user_key = ikey.user_key;
+ fkey->sequence = ikey.sequence;
+ fkey->type = GetEntryType(ikey.type);
+ return true;
+}
+
+void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t) {
+ *seq = packed >> 8;
+ *t = static_cast<ValueType>(packed & 0xff);
+
+ assert(*seq <= kMaxSequenceNumber);
+ assert(IsExtendedValueType(*t));
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+ result->append(key.user_key.data(), key.user_key.size());
+ PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+ ValueType t) {
+ PutFixed64(result, PackSequenceAndType(s, t));
+}
+
+std::string ParsedInternalKey::DebugString(bool hex) const {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence,
+ static_cast<int>(type));
+ std::string result = "'";
+ result += user_key.ToString(hex);
+ result += buf;
+ return result;
+}
+
+std::string InternalKey::DebugString(bool hex) const {
+ std::string result;
+ ParsedInternalKey parsed;
+ if (ParseInternalKey(rep_, &parsed)) {
+ result = parsed.DebugString(hex);
+ } else {
+ result = "(bad)";
+ result.append(EscapeString(rep_));
+ }
+ return result;
+}
+
+const char* InternalKeyComparator::Name() const { return name_.c_str(); }
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+ const ParsedInternalKey& b) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_.Compare(a.user_key, b.user_key);
+ if (r == 0) {
+ if (a.sequence > b.sequence) {
+ r = -1;
+ } else if (a.sequence < b.sequence) {
+ r = +1;
+ } else if (a.type > b.type) {
+ r = -1;
+ } else if (a.type < b.type) {
+ r = +1;
+ }
+ }
+ return r;
+}
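+// For illustration: with a bytewise user comparator, entries that share a
+// user key sort by decreasing sequence number, so ("foo", 100, kTypeValue)
+// compares less than ("foo", 99, kTypeValue); a seek therefore lands on the
+// newest entry first.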
+
+void InternalKeyComparator::FindShortestSeparator(std::string* start,
+ const Slice& limit) const {
+ // Attempt to shorten the user portion of the key
+ Slice user_start = ExtractUserKey(*start);
+ Slice user_limit = ExtractUserKey(limit);
+ std::string tmp(user_start.data(), user_start.size());
+ user_comparator_.FindShortestSeparator(&tmp, user_limit);
+ if (tmp.size() <= user_start.size() &&
+ user_comparator_.Compare(user_start, tmp) < 0) {
+ // User key has become shorter physically, but larger logically.
+ // Tack on the earliest possible number to the shortened user key.
+ PutFixed64(&tmp,
+ PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+ assert(this->Compare(*start, tmp) < 0);
+ assert(this->Compare(tmp, limit) < 0);
+ start->swap(tmp);
+ }
+}
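+// For illustration (exercised in dbformat_test.cc): separating
+// ("foo", 100, kTypeValue) from ("hello", 200, kTypeValue) yields
+// ("g", kMaxSequenceNumber, kValueTypeForSeek), while inputs that share the
+// same user key are left unchanged.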
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+ Slice user_key = ExtractUserKey(*key);
+ std::string tmp(user_key.data(), user_key.size());
+ user_comparator_.FindShortSuccessor(&tmp);
+ if (tmp.size() <= user_key.size() &&
+ user_comparator_.Compare(user_key, tmp) < 0) {
+ // User key has become shorter physically, but larger logically.
+ // Tack on the earliest possible number to the shortened user key.
+ PutFixed64(&tmp,
+ PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+ assert(this->Compare(*key, tmp) < 0);
+ key->swap(tmp);
+ }
+}
+
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
+ const Slice* ts) {
+ size_t usize = _user_key.size();
+ size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
+ size_t needed = usize + ts_sz + 13; // A conservative estimate
+ char* dst;
+ if (needed <= sizeof(space_)) {
+ dst = space_;
+ } else {
+ dst = new char[needed];
+ }
+ start_ = dst;
+ // NOTE: We don't support user keys of more than 2GB :)
+ dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
+ kstart_ = dst;
+ memcpy(dst, _user_key.data(), usize);
+ dst += usize;
+ if (nullptr != ts) {
+ memcpy(dst, ts->data(), ts_sz);
+ dst += ts_sz;
+ }
+ EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+ dst += 8;
+ end_ = dst;
+}
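+// For illustration, the buffer built above is laid out as
+//   [varint32 of (usize + ts_sz + 8)][user key][timestamp]
+//   [fixed64 of PackSequenceAndType(s, kValueTypeForSeek)]
+// with start_ at the varint, kstart_ at the user key, and end_ one past the
+// trailing fixed64, so the memtable key, internal key, and user key can be
+// recovered as sub-slices of this single allocation.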
+
+void IterKey::EnlargeBuffer(size_t key_size) {
+ // This is only reached when key_size exceeds the current buffer size;
+ // smaller keys keep using the current (or statically allocated) buffer.
+ assert(key_size > buf_size_);
+ // Need to enlarge the buffer.
+ ResetBuffer();
+ buf_ = new char[key_size];
+ buf_size_ = key_size;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
new file mode 100644
index 000000000..de98be8df
--- /dev/null
+++ b/src/rocksdb/db/dbformat.h
@@ -0,0 +1,671 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+#include <memory>
+#include <string>
+#include <utility>
+#include "db/lookup_key.h"
+#include "db/merge_context.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file declares data structures and functions that deal with internal
+// keys.
+// Each internal key contains a user key, a sequence number (SequenceNumber)
+// and a type (ValueType), and they are usually encoded together.
+// There are some related helper classes here.
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved for SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+ kTypeDeletion = 0x0,
+ kTypeValue = 0x1,
+ kTypeMerge = 0x2,
+ kTypeLogData = 0x3, // WAL only.
+ kTypeColumnFamilyDeletion = 0x4, // WAL only.
+ kTypeColumnFamilyValue = 0x5, // WAL only.
+ kTypeColumnFamilyMerge = 0x6, // WAL only.
+ kTypeSingleDeletion = 0x7,
+ kTypeColumnFamilySingleDeletion = 0x8, // WAL only.
+ kTypeBeginPrepareXID = 0x9, // WAL only.
+ kTypeEndPrepareXID = 0xA, // WAL only.
+ kTypeCommitXID = 0xB, // WAL only.
+ kTypeRollbackXID = 0xC, // WAL only.
+ kTypeNoop = 0xD, // WAL only.
+ kTypeColumnFamilyRangeDeletion = 0xE, // WAL only.
+ kTypeRangeDeletion = 0xF, // meta block
+ kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only
+ kTypeBlobIndex = 0x11, // Blob DB only
+ // When the prepared record is also persisted in db, we use a different
+ // record. This is to ensure that the WAL that is generated by a WritePolicy
+ // is not mistakenly read by another, which would result in data
+ // inconsistency.
+ kTypeBeginPersistedPrepareXID = 0x12, // WAL only.
+ // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
+ // generated by WriteUnprepared write policy is not mistakenly read by
+ // another.
+ kTypeBeginUnprepareXID = 0x13, // WAL only.
+ kMaxValue = 0x7F // Not used for storing records.
+};
+
+// Defined in dbformat.cc
+extern const ValueType kValueTypeForSeek;
+extern const ValueType kValueTypeForSeekForPrev;
+
+// Checks whether a type is an inline value type
+// (i.e. a type used in memtable skiplist and sst file datablock).
+inline bool IsValueType(ValueType t) {
+ return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex;
+}
+
+// Checks whether a type is from a user operation.
+// kTypeRangeDeletion is in the meta block, so this API is separated from the
+// one above.
+inline bool IsExtendedValueType(ValueType t) {
+ return IsValueType(t) || t == kTypeRangeDeletion;
+}
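+// For illustration: IsValueType(kTypeValue) and IsValueType(kTypeSingleDeletion)
+// hold, IsValueType(kTypeRangeDeletion) does not but
+// IsExtendedValueType(kTypeRangeDeletion) does, and WAL-only types such as
+// kTypeLogData satisfy neither predicate.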
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
+
+static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64;
+
+// The data structure that represents an internal key in the way that user_key,
+// sequence number and type are stored in separated forms.
+struct ParsedInternalKey {
+ Slice user_key;
+ SequenceNumber sequence;
+ ValueType type;
+
+ ParsedInternalKey()
+ : sequence(kMaxSequenceNumber) // Make code analyzer happy
+ {} // Intentionally left uninitialized (for speed)
+ ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+ : user_key(u), sequence(seq), type(t) {}
+ std::string DebugString(bool hex = false) const;
+
+ void clear() {
+ user_key.clear();
+ sequence = 0;
+ type = kTypeDeletion;
+ }
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+ return key.user_key.size() + 8;
+}
+
+// Pack a sequence number and a ValueType into a uint64_t
+extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);
+
+// Given the result of PackSequenceAndType, store the sequence number in *seq
+// and the ValueType in *t.
+extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t);
+
+EntryType GetEntryType(ValueType value_type);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+ const ParsedInternalKey& key);
+// Serialized internal key consists of user key followed by footer.
+// This function appends the footer to *result, assuming that *result already
+// contains the user key at the end.
+extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+ ValueType t);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+ assert(internal_key.size() >= 8);
+ return Slice(internal_key.data(), internal_key.size() - 8);
+}
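+// For illustration: an internal key is the user key followed by an 8-byte
+// footer, so for a 13-byte internal key ExtractUserKey() returns the first
+// 5 bytes and ExtractInternalKeyFooter() decodes the trailing 8 bytes.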
+
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+ size_t ts_sz) {
+ assert(internal_key.size() >= 8 + ts_sz);
+ return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz);
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+ assert(user_key.size() >= ts_sz);
+ return Slice(user_key.data(), user_key.size() - ts_sz);
+}
+
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
+ assert(internal_key.size() >= 8);
+ const size_t n = internal_key.size();
+ return DecodeFixed64(internal_key.data() + n - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+ uint64_t num = ExtractInternalKeyFooter(internal_key);
+ unsigned char c = num & 0xff;
+ return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator
+#ifdef NDEBUG
+ final
+#endif
+ : public Comparator {
+ private:
+ UserComparatorWrapper user_comparator_;
+ std::string name_;
+
+ public:
+ explicit InternalKeyComparator(const Comparator* c)
+ : user_comparator_(c),
+ name_("rocksdb.InternalKeyComparator:" +
+ std::string(user_comparator_.Name())) {}
+ virtual ~InternalKeyComparator() {}
+
+ virtual const char* Name() const override;
+ virtual int Compare(const Slice& a, const Slice& b) const override;
+ // Same as Compare except that it excludes the value type from comparison
+ virtual int CompareKeySeq(const Slice& a, const Slice& b) const;
+ virtual void FindShortestSeparator(std::string* start,
+ const Slice& limit) const override;
+ virtual void FindShortSuccessor(std::string* key) const override;
+
+ const Comparator* user_comparator() const {
+ return user_comparator_.user_comparator();
+ }
+
+ int Compare(const InternalKey& a, const InternalKey& b) const;
+ int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+ virtual const Comparator* GetRootComparator() const override {
+ return user_comparator_.GetRootComparator();
+ }
+};
+
+// This class represents the internal key in encoded form.
+class InternalKey {
+ private:
+ std::string rep_;
+
+ public:
+ InternalKey() {} // Leave rep_ as empty to indicate it is invalid
+ InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+ AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
+ }
+
+ // Sets the internal key to be greater than or equal to all internal keys
+ // with this user key
+ void SetMaxPossibleForUserKey(const Slice& _user_key) {
+ AppendInternalKey(
+ &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
+ }
+
+ // Sets the internal key to be less than or equal to all internal keys with
+ // this user key
+ void SetMinPossibleForUserKey(const Slice& _user_key) {
+ AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek));
+ }
+
+ bool Valid() const {
+ ParsedInternalKey parsed;
+ return ParseInternalKey(Slice(rep_), &parsed);
+ }
+
+ void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+ Slice Encode() const {
+ assert(!rep_.empty());
+ return rep_;
+ }
+
+ Slice user_key() const { return ExtractUserKey(rep_); }
+ size_t size() { return rep_.size(); }
+
+ void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
+ SetFrom(ParsedInternalKey(_user_key, s, t));
+ }
+
+ void SetFrom(const ParsedInternalKey& p) {
+ rep_.clear();
+ AppendInternalKey(&rep_, p);
+ }
+
+ void Clear() { rep_.clear(); }
+
+ // The underlying representation.
+ // Intended only to be used together with ConvertFromUserKey().
+ std::string* rep() { return &rep_; }
+
+ // Assuming that *rep() contains a user key, this method makes internal key
+ // out of it in-place. This saves a memcpy compared to Set()/SetFrom().
+ void ConvertFromUserKey(SequenceNumber s, ValueType t) {
+ AppendInternalKeyFooter(&rep_, s, t);
+ }
+
+ std::string DebugString(bool hex = false) const;
+};
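+// For illustration: InternalKey("foo", 100, kTypeValue).Encode() is the user
+// key "foo" followed by PackSequenceAndType(100, kTypeValue) as a fixed64,
+// and DebugString() renders it as 'foo' seq:100, type:1.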
+
+inline int InternalKeyComparator::Compare(const InternalKey& a,
+ const InternalKey& b) const {
+ return Compare(a.Encode(), b.Encode());
+}
+
+inline bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result) {
+ const size_t n = internal_key.size();
+ if (n < 8) return false;
+ uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+ unsigned char c = num & 0xff;
+ result->sequence = num >> 8;
+ result->type = static_cast<ValueType>(c);
+ assert(result->type <= ValueType::kMaxValue);
+ result->user_key = Slice(internal_key.data(), n - 8);
+ return IsExtendedValueType(result->type);
+}
+
+// Update the sequence number in the internal key.
+// Guarantees not to invalidate ikey.data().
+inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
+ size_t ikey_sz = ikey->size();
+ assert(ikey_sz >= 8);
+ uint64_t newval = (seq << 8) | t;
+
+ // Note: Since C++11, strings are guaranteed to be stored contiguously and
+ // string::operator[]() is guaranteed not to change ikey.data().
+ EncodeFixed64(&(*ikey)[ikey_sz - 8], newval);
+}
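+// For illustration (see FormatTest.UpdateInternalKey): a key built with
+// (seq = 100, kTypeValue) can be rewritten in place via
+// UpdateInternalKey(&ikey, 0x123456, kTypeDeletion); only the trailing 8
+// bytes change, the size stays the same, and re-parsing yields the new
+// sequence number and type.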
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+ const size_t n = internal_key.size();
+ assert(n >= 8);
+ uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+ return num >> 8;
+}
+
+// The class to store keys in an efficient way. It allows:
+// 1. Users can either copy the key into it, or have it point to an unowned
+// address.
+// 2. For copied keys, a short inline buffer is kept to reduce memory
+// allocation for smaller keys.
+// 3. It tracks whether it holds a user key or an internal key, and allows
+// conversion between them.
+class IterKey {
+ public:
+ IterKey()
+ : buf_(space_),
+ key_(buf_),
+ key_size_(0),
+ buf_size_(sizeof(space_)),
+ is_user_key_(true) {}
+ // No copying allowed
+ IterKey(const IterKey&) = delete;
+ void operator=(const IterKey&) = delete;
+
+ ~IterKey() { ResetBuffer(); }
+
+ // The bool will be picked up by the next calls to SetKey
+ void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
+
+ // Returns the key in whichever format it was provided to IterKey
+ Slice GetKey() const { return Slice(key_, key_size_); }
+
+ Slice GetInternalKey() const {
+ assert(!IsUserKey());
+ return Slice(key_, key_size_);
+ }
+
+ Slice GetUserKey() const {
+ if (IsUserKey()) {
+ return Slice(key_, key_size_);
+ } else {
+ assert(key_size_ >= 8);
+ return Slice(key_, key_size_ - 8);
+ }
+ }
+
+ size_t Size() const { return key_size_; }
+
+ void Clear() { key_size_ = 0; }
+
+ // Append "non_shared_data" to its back, from "shared_len"
+ // This function is used in Block::Iter::ParseNextKey
+ // shared_len: bytes in [0, shard_len-1] would be remained
+ // non_shared_data: data to be append, its length must be >= non_shared_len
+ void TrimAppend(const size_t shared_len, const char* non_shared_data,
+ const size_t non_shared_len) {
+ assert(shared_len <= key_size_);
+ size_t total_size = shared_len + non_shared_len;
+
+ if (IsKeyPinned() /* key is not in buf_ */) {
+ // Copy the key from external memory to buf_ (copy shared_len bytes)
+ EnlargeBufferIfNeeded(total_size);
+ memcpy(buf_, key_, shared_len);
+ } else if (total_size > buf_size_) {
+ // Need to allocate space, delete previous space
+ char* p = new char[total_size];
+ memcpy(p, key_, shared_len);
+
+ if (buf_ != space_) {
+ delete[] buf_;
+ }
+
+ buf_ = p;
+ buf_size_ = total_size;
+ }
+
+ memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
+ key_ = buf_;
+ key_size_ = total_size;
+ }
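+ // For illustration (see FormatTest.IterKeyOperation): starting from an empty
+ // key, TrimAppend(0, "abcdef...", 3) sets the key to "abc", and a subsequent
+ // TrimAppend(1, "abcdef...", 3) keeps the first byte and appends three more,
+ // producing "aabc".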
+
+ Slice SetKey(const Slice& key, bool copy = true) {
+ // is_user_key_ expected to be set already via SetIsUserKey
+ return SetKeyImpl(key, copy);
+ }
+
+ Slice SetUserKey(const Slice& key, bool copy = true) {
+ is_user_key_ = true;
+ return SetKeyImpl(key, copy);
+ }
+
+ Slice SetInternalKey(const Slice& key, bool copy = true) {
+ is_user_key_ = false;
+ return SetKeyImpl(key, copy);
+ }
+
+ // Copies the content of key, updates the reference to the user key in ikey
+ // and returns a Slice referencing the new copy.
+ Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
+ size_t key_n = key.size();
+ assert(key_n >= 8);
+ SetInternalKey(key);
+ ikey->user_key = Slice(key_, key_n - 8);
+ return Slice(key_, key_n);
+ }
+
+ // Copy the key into IterKey own buf_
+ void OwnKey() {
+ assert(IsKeyPinned() == true);
+
+ Reserve(key_size_);
+ memcpy(buf_, key_, key_size_);
+ key_ = buf_;
+ }
+
+ // Update the sequence number in the internal key. Guarantees not to
+ // invalidate slices to the key (and the user key).
+ void UpdateInternalKey(uint64_t seq, ValueType t) {
+ assert(!IsKeyPinned());
+ assert(key_size_ >= 8);
+ uint64_t newval = (seq << 8) | t;
+ EncodeFixed64(&buf_[key_size_ - 8], newval);
+ }
+
+ bool IsKeyPinned() const { return (key_ != buf_); }
+
+ void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
+ SequenceNumber s,
+ ValueType value_type = kValueTypeForSeek) {
+ size_t psize = key_prefix.size();
+ size_t usize = user_key.size();
+ EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t));
+ if (psize > 0) {
+ memcpy(buf_, key_prefix.data(), psize);
+ }
+ memcpy(buf_ + psize, user_key.data(), usize);
+ EncodeFixed64(buf_ + usize + psize, PackSequenceAndType(s, value_type));
+
+ key_ = buf_;
+ key_size_ = psize + usize + sizeof(uint64_t);
+ is_user_key_ = false;
+ }
+
+ void SetInternalKey(const Slice& user_key, SequenceNumber s,
+ ValueType value_type = kValueTypeForSeek) {
+ SetInternalKey(Slice(), user_key, s, value_type);
+ }
+
+ void Reserve(size_t size) {
+ EnlargeBufferIfNeeded(size);
+ key_size_ = size;
+ }
+
+ void SetInternalKey(const ParsedInternalKey& parsed_key) {
+ SetInternalKey(Slice(), parsed_key);
+ }
+
+ void SetInternalKey(const Slice& key_prefix,
+ const ParsedInternalKey& parsed_key_suffix) {
+ SetInternalKey(key_prefix, parsed_key_suffix.user_key,
+ parsed_key_suffix.sequence, parsed_key_suffix.type);
+ }
+
+ void EncodeLengthPrefixedKey(const Slice& key) {
+ auto size = key.size();
+ EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+ char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
+ memcpy(ptr, key.data(), size);
+ key_ = buf_;
+ is_user_key_ = true;
+ }
+
+ bool IsUserKey() const { return is_user_key_; }
+
+ private:
+ char* buf_;
+ const char* key_;
+ size_t key_size_;
+ size_t buf_size_;
+ char space_[32]; // Avoid allocation for short keys
+ bool is_user_key_;
+
+ Slice SetKeyImpl(const Slice& key, bool copy) {
+ size_t size = key.size();
+ if (copy) {
+ // Copy key to buf_
+ EnlargeBufferIfNeeded(size);
+ memcpy(buf_, key.data(), size);
+ key_ = buf_;
+ } else {
+ // Update key_ to point to external memory
+ key_ = key.data();
+ }
+ key_size_ = size;
+ return Slice(key_, key_size_);
+ }
+
+ void ResetBuffer() {
+ if (buf_ != space_) {
+ delete[] buf_;
+ buf_ = space_;
+ }
+ buf_size_ = sizeof(space_);
+ key_size_ = 0;
+ }
+
+ // Enlarge the buffer size if needed based on key_size.
+ // By default, the statically allocated buffer is used. Once a key larger
+ // than that buffer is seen, another buffer is dynamically allocated, until
+ // an even larger key buffer is requested. In that case, we reallocate the
+ // buffer and delete the old one.
+ void EnlargeBufferIfNeeded(size_t key_size) {
+ // If size is smaller than the buffer size, continue using the current
+ // buffer, or the statically allocated one, as default
+ if (key_size > buf_size_) {
+ EnlargeBuffer(key_size);
+ }
+ }
+
+ void EnlargeBuffer(size_t key_size);
+};
+
+// Wraps a SliceTransform over user keys so that it can be applied to
+// internal keys.
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+ explicit InternalKeySliceTransform(const SliceTransform* transform)
+ : transform_(transform) {}
+
+ virtual const char* Name() const override { return transform_->Name(); }
+
+ virtual Slice Transform(const Slice& src) const override {
+ auto user_key = ExtractUserKey(src);
+ return transform_->Transform(user_key);
+ }
+
+ virtual bool InDomain(const Slice& src) const override {
+ auto user_key = ExtractUserKey(src);
+ return transform_->InDomain(user_key);
+ }
+
+ virtual bool InRange(const Slice& dst) const override {
+ auto user_key = ExtractUserKey(dst);
+ return transform_->InRange(user_key);
+ }
+
+ const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+ // Like comparator, InternalKeySliceTransform will not take care of the
+ // deletion of transform_
+ const SliceTransform* const transform_;
+};
+
+// Read the key of a record from a write batch.
+// If this record represents the default column family, then cf_record
+// must be passed as false; otherwise it must be passed as true.
+extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
+ bool cf_record);
+
+// Read a record from a write batch piece pointed to by "input".
+// tag, column_family, key, value and blob are return values. Callers own the
+// Slice they point to.
+// Tag is defined as ValueType.
+// input will be advanced to after the record.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+ uint32_t* column_family, Slice* key,
+ Slice* value, Slice* blob, Slice* xid);
+
+// When a user calls DeleteRange() to delete a range of keys, we store a
+// serialized RangeTombstone in the MemTable and in SST files.
+// The struct here is an easy-to-understand form of that record:
+// start_key_/end_key_ are the start/end user keys of the range to be deleted.
+struct RangeTombstone {
+ Slice start_key_;
+ Slice end_key_;
+ SequenceNumber seq_;
+ RangeTombstone() = default;
+ RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
+ : start_key_(sk), end_key_(ek), seq_(sn) {}
+
+ RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
+ start_key_ = parsed_key.user_key;
+ seq_ = parsed_key.sequence;
+ end_key_ = value;
+ }
+
+ // Be careful when using Serialize(): it allocates new memory
+ std::pair<InternalKey, Slice> Serialize() const {
+ auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
+ Slice value = end_key_;
+ return std::make_pair(std::move(key), std::move(value));
+ }
+
+ // Be careful when using SerializeKey(): it allocates new memory
+ InternalKey SerializeKey() const {
+ return InternalKey(start_key_, seq_, kTypeRangeDeletion);
+ }
+
+ // The tombstone end-key is exclusive, so we generate an internal-key here
+ // which has a similar property. Using kMaxSequenceNumber guarantees that
+ // the returned internal-key will compare less than any other internal-key
+ // with the same user-key. This in turn guarantees that the serialized
+ // end-key for a tombstone such as [a-b] will compare less than the key "b".
+ //
+ // Be careful when using SerializeEndKey(): it allocates new memory
+ InternalKey SerializeEndKey() const {
+ return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+};
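+// For illustration (see FormatTest.RangeTombstoneSerializeEndKey):
+// RangeTombstone("a", "b", 2).Serialize() yields the pair
+// (InternalKey("a", 2, kTypeRangeDeletion), Slice("b")), and SerializeEndKey()
+// returns InternalKey("b", kMaxSequenceNumber, kTypeRangeDeletion), which
+// compares less than InternalKey("b", 3, kTypeValue) under the internal key
+// comparator, matching the exclusive end-key semantics described above.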
+
+inline int InternalKeyComparator::Compare(const Slice& akey,
+ const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+ const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
+ const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ // Shift the number to exclude the last byte which contains the value type
+ const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8;
+ const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8;
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey.
+struct ParsedInternalKeyComparator {
+ explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
+ : cmp(c) {}
+
+ bool operator()(const ParsedInternalKey& a,
+ const ParsedInternalKey& b) const {
+ return cmp->Compare(a, b) < 0;
+ }
+
+ const InternalKeyComparator* cmp;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc
new file mode 100644
index 000000000..a2c67795a
--- /dev/null
+++ b/src/rocksdb/db/dbformat_test.cc
@@ -0,0 +1,207 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string IKey(const std::string& user_key,
+ uint64_t seq,
+ ValueType vt) {
+ std::string encoded;
+ AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+ return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+ std::string result = s;
+ InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+ return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+ std::string result = s;
+ InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+ return result;
+}
+
+static void TestKey(const std::string& key,
+ uint64_t seq,
+ ValueType vt) {
+ std::string encoded = IKey(key, seq, vt);
+
+ Slice in(encoded);
+ ParsedInternalKey decoded("", 0, kTypeValue);
+
+ ASSERT_TRUE(ParseInternalKey(in, &decoded));
+ ASSERT_EQ(key, decoded.user_key.ToString());
+ ASSERT_EQ(seq, decoded.sequence);
+ ASSERT_EQ(vt, decoded.type);
+
+ ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest : public testing::Test {};
+
+TEST_F(FormatTest, InternalKey_EncodeDecode) {
+ const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+ const uint64_t seq[] = {
+ 1, 2, 3,
+ (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+ (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+ (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+ };
+ for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+ for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+ TestKey(keys[k], seq[s], kTypeValue);
+ TestKey("hello", 1, kTypeDeletion);
+ }
+ }
+}
+
+TEST_F(FormatTest, InternalKeyShortSeparator) {
+ // When user keys are same
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 99, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 101, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeDeletion)));
+
+ // When user keys are misordered
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("bar", 99, kTypeValue)));
+
+ // When user keys are different, but correctly ordered
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("hello", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("ABC1AAAAA", 100, kTypeValue),
+ IKey("ABC2ABB", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue),
+ IKey("AAA2AA", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA4", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA1B", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue),
+ IKey("AAA2A", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA1", 100, kTypeValue),
+ Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+ // When start user key is prefix of limit user key
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foobar", 200, kTypeValue)));
+
+ // When limit user key is prefix of start user key
+ ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+ Shorten(IKey("foobar", 100, kTypeValue),
+ IKey("foo", 200, kTypeValue)));
+}
+
+TEST_F(FormatTest, InternalKeyShortestSuccessor) {
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ ShortSuccessor(IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+ ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+TEST_F(FormatTest, IterKeyOperation) {
+ IterKey k;
+ const char p[] = "abcdefghijklmnopqrstuvwxyz";
+ const char q[] = "0123456789";
+
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string(""));
+
+ k.TrimAppend(0, p, 3);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abc"));
+
+ k.TrimAppend(1, p, 3);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("aabc"));
+
+ k.TrimAppend(0, p, 26);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz"));
+
+ k.TrimAppend(26, q, 10);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0123456789"));
+
+ k.TrimAppend(36, q, 1);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz01234567890"));
+
+ k.TrimAppend(26, q, 1);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0"));
+
+ // Size going up, memory allocation is triggered
+ k.TrimAppend(27, p, 26);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0"
+ "abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST_F(FormatTest, UpdateInternalKey) {
+ std::string user_key("abcdefghijklmnopqrstuvwxyz");
+ uint64_t new_seq = 0x123456;
+ ValueType new_val_type = kTypeDeletion;
+
+ std::string ikey;
+ AppendInternalKey(&ikey, ParsedInternalKey(user_key, 100U, kTypeValue));
+ size_t ikey_size = ikey.size();
+ UpdateInternalKey(&ikey, new_seq, new_val_type);
+ ASSERT_EQ(ikey_size, ikey.size());
+
+ Slice in(ikey);
+ ParsedInternalKey decoded;
+ ASSERT_TRUE(ParseInternalKey(in, &decoded));
+ ASSERT_EQ(user_key, decoded.user_key.ToString());
+ ASSERT_EQ(new_seq, decoded.sequence);
+ ASSERT_EQ(new_val_type, decoded.type);
+}
+
+TEST_F(FormatTest, RangeTombstoneSerializeEndKey) {
+ RangeTombstone t("a", "b", 2);
+ InternalKey k("b", 3, kTypeValue);
+ const InternalKeyComparator cmp(BytewiseComparator());
+ ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc
new file mode 100644
index 000000000..f202388c0
--- /dev/null
+++ b/src/rocksdb/db/deletefile_test.cc
@@ -0,0 +1,571 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+#include <map>
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteFileTest : public DBTestBase {
+ public:
+ const int numlevels_;
+ const std::string wal_dir_;
+
+ DeleteFileTest()
+ : DBTestBase("/deletefile_test"),
+ numlevels_(7),
+ wal_dir_(dbname_ + "/wal_files") {}
+
+ void SetOptions(Options* options) {
+ assert(options);
+ options->delete_obsolete_files_period_micros = 0; // always do full purge
+ options->enable_thread_tracking = true;
+ options->write_buffer_size = 1024 * 1024 * 1000;
+ options->target_file_size_base = 1024 * 1024 * 1000;
+ options->max_bytes_for_level_base = 1024 * 1024 * 1000;
+ options->WAL_ttl_seconds = 300; // Used to test log files
+ options->WAL_size_limit_MB = 1024; // Used to test log files
+ options->wal_dir = wal_dir_;
+ }
+
+ void AddKeys(int numkeys, int startkey = 0) {
+ WriteOptions options;
+ options.sync = false;
+ ReadOptions roptions;
+ for (int i = startkey; i < (numkeys + startkey) ; i++) {
+ std::string temp = ToString(i);
+ Slice key(temp);
+ Slice value(temp);
+ ASSERT_OK(db_->Put(options, key, value));
+ }
+ }
+
+ int numKeysInLevels(
+ std::vector<LiveFileMetaData> &metadata,
+ std::vector<int> *keysperlevel = nullptr) {
+
+ if (keysperlevel != nullptr) {
+ keysperlevel->resize(numlevels_);
+ }
+
+ int numKeys = 0;
+ for (size_t i = 0; i < metadata.size(); i++) {
+ int startkey = atoi(metadata[i].smallestkey.c_str());
+ int endkey = atoi(metadata[i].largestkey.c_str());
+ int numkeysinfile = (endkey - startkey + 1);
+ numKeys += numkeysinfile;
+ if (keysperlevel != nullptr) {
+ (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+ }
+ fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+ metadata[i].level, metadata[i].name.c_str(),
+ metadata[i].smallestkey.c_str(),
+ metadata[i].largestkey.c_str());
+ }
+ return numKeys;
+ }
+
+ void CreateTwoLevels() {
+ AddKeys(50000, 10000);
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr));
+ }
+
+ AddKeys(50000, 10000);
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int required_log,
+ int required_sst, int required_manifest) {
+ std::vector<std::string> filenames;
+ env_->GetChildren(dir, &filenames);
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kLogFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(required_log, log_cnt);
+ ASSERT_EQ(required_sst, sst_cnt);
+ ASSERT_EQ(required_manifest, manifest_cnt);
+ }
+
+ static void DoSleep(void* arg) {
+ auto test = reinterpret_cast<DeleteFileTest*>(arg);
+ test->env_->SleepForMicroseconds(2 * 1000 * 1000);
+ }
+
+  // An empty job used to guarantee that all previously scheduled jobs have
+  // been processed
+ static void GuardFinish(void* /*arg*/) {
+ TEST_SYNC_POINT("DeleteFileTest::GuardFinish");
+ }
+};
+
+TEST_F(DeleteFileTest, AddKeysAndQueryLevels) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ std::string level1file = "";
+ int level1keycount = 0;
+ std::string level2file = "";
+ int level2keycount = 0;
+ int level1index = 0;
+ int level2index = 1;
+
+ ASSERT_EQ((int)metadata.size(), 2);
+ if (metadata[0].level == 2) {
+ level1index = 1;
+ level2index = 0;
+ }
+
+ level1file = metadata[level1index].name;
+ int startkey = atoi(metadata[level1index].smallestkey.c_str());
+ int endkey = atoi(metadata[level1index].largestkey.c_str());
+ level1keycount = (endkey - startkey + 1);
+ level2file = metadata[level2index].name;
+ startkey = atoi(metadata[level2index].smallestkey.c_str());
+ endkey = atoi(metadata[level2index].largestkey.c_str());
+ level2keycount = (endkey - startkey + 1);
+
+  // Controlled setup: levels 1 and 2 should each hold 50K keys.
+  // This is a little fragile, as it depends on the current
+  // compaction heuristics.
+ ASSERT_EQ(level1keycount, 50000);
+ ASSERT_EQ(level2keycount, 50000);
+
+ Status status = db_->DeleteFile("0.sst");
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ // intermediate level files cannot be deleted.
+ status = db_->DeleteFile(level1file);
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ // Lowest level file deletion should succeed.
+ ASSERT_OK(db_->DeleteFile(level2file));
+}
+
+TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ // there should be only one (empty) log file because CreateTwoLevels()
+ // flushes the memtables to disk
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+ // 2 ssts, 1 manifest
+ CheckFileTypeCounts(dbname_, 0, 2, 1);
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 1 sst after compaction
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ // this time, we keep an iterator alive
+ Reopen(options);
+ Iterator *itr = nullptr;
+ CreateTwoLevels();
+ itr = db_->NewIterator(ReadOptions());
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ delete itr;
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ itr = db_->NewIterator(read_options);
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ test::SleepingBackgroundTask sleeping_task_before;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_before, Env::Priority::HIGH);
+ delete itr;
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure no purges are executed in the foreground
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ sleeping_task_before.WakeUp();
+ sleeping_task_before.WaitUntilDone();
+
+ // Make sure all background purges are executed
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
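The test above exercises ReadOptions::background_purge_on_iterator_cleanup, which hands the obsolete-file purge triggered by an iterator's destruction to a HIGH-priority background job instead of running it on the thread that deletes the iterator. A minimal application-side sketch of the same option follows; the DB path and option values are placeholders, not taken from this test.

#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void IterateWithBackgroundPurge() {
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Options opts;
  opts.create_if_missing = true;
  // Placeholder path; any writable directory works.
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(opts, "/tmp/bg_purge_example", &db);
  assert(s.ok());

  {
    ROCKSDB_NAMESPACE::ReadOptions ro;
    // Defer the post-iterator purge of obsolete files to a background job so
    // that destroying the iterator does not block on file deletions.
    ro.background_purge_on_iterator_cleanup = true;
    std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      // ... consume it->key() / it->value() ...
    }
  }  // iterator destroyed here; obsolete files are purged in the background

  delete db;
}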
+
+TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ auto do_test = [&](bool bg_purge) {
+ ColumnFamilyOptions co;
+ co.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(co.write_buffer_size);
+ WriteOptions wo;
+ FlushOptions fo;
+ ColumnFamilyHandle* cfh = nullptr;
+
+ ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh));
+
+ ASSERT_OK(db_->Put(wo, cfh, "pika", "chu"));
+ ASSERT_OK(db_->Flush(fo, cfh));
+ // Expect 1 sst file.
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ // Still 1 file, it won't be deleted while ColumnFamilyHandle is alive.
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ delete cfh;
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+ // If background purge is enabled, the file should still be there.
+ CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1);
+ TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1");
+
+ // Execute background purges.
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // The file should have been deleted.
+ CheckFileTypeCounts(dbname_, 0, 0, 1);
+ };
+
+ {
+ SCOPED_TRACE("avoid_unnecessary_blocking_io = false");
+ do_test(false);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DeleteFileTest::BackgroundPurgeCFDropTest:1",
+ "DBImpl::BGWorkPurge:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.avoid_unnecessary_blocking_io = true;
+ options.create_if_missing = false;
+ Reopen(options);
+ {
+ SCOPED_TRACE("avoid_unnecessary_blocking_io = true");
+ do_test(true);
+ }
+}
+
+// This test reproduces a bug where an invalid ReadOptions object was read in
+// the iterator cleanup function
+TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ {
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ itr = db_->NewIterator(read_options);
+    // ReadOptions goes out of scope here, but the iterator cleanup function
+    // should not be affected
+ }
+
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ delete itr;
+
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+
+ // Make sure all background purges are executed
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ CreateTwoLevels();
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ Iterator* itr1 = db_->NewIterator(read_options);
+ CreateTwoLevels();
+ Iterator* itr2 = db_->NewIterator(read_options);
+ db_->CompactRange(compact_options, &first_slice, &last_slice);
+ // 5 sst files after 2 compactions with 2 live iterators
+ CheckFileTypeCounts(dbname_, 0, 5, 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ // ~DBImpl should wait until all BGWorkPurge are finished
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::~DBImpl:WaitJob", "DBImpl::BGWorkPurge"},
+ {"DeleteFileTest::GuardFinish",
+ "DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ delete itr1;
+ env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH);
+ delete itr2;
+ env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH);
+ Close();
+
+ TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose");
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, DeleteFileWithIterator) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ ReadOptions read_options;
+ Iterator* it = db_->NewIterator(read_options);
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ std::string level2file;
+
+ ASSERT_EQ(metadata.size(), static_cast<size_t>(2));
+ if (metadata[0].level == 1) {
+ level2file = metadata[1].name;
+ } else {
+ level2file = metadata[0].name;
+ }
+
+ Status status = db_->DeleteFile(level2file);
+ fprintf(stdout, "Deletion status %s: %s\n",
+ level2file.c_str(), status.ToString().c_str());
+ ASSERT_TRUE(status.ok());
+ it->SeekToFirst();
+ int numKeysIterated = 0;
+  while (it->Valid()) {
+ numKeysIterated++;
+ it->Next();
+ }
+ ASSERT_EQ(numKeysIterated, 50000);
+ delete it;
+}
+
+TEST_F(DeleteFileTest, DeleteLogFiles) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ AddKeys(10, 0);
+ VectorLogPtr logfiles;
+ db_->GetSortedWalFiles(logfiles);
+ ASSERT_GT(logfiles.size(), 0UL);
+  // Take the last log file, which is expected to be alive, and try to delete
+  // it. This should not succeed because live logs are not allowed to be
+  // deleted.
+ std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+ ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+ fprintf(stdout, "Deleting alive log file %s\n",
+ alive_log->PathName().c_str());
+ ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+ logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys.
+  // Call Flush again to flush out the memtable, move the alive log to the
+  // archived log, and then try to delete the archived log file.
+ FlushOptions fopts;
+ db_->Flush(fopts);
+ AddKeys(10, 0);
+ db_->Flush(fopts);
+ db_->GetSortedWalFiles(logfiles);
+ ASSERT_GT(logfiles.size(), 0UL);
+ std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+ ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+ fprintf(stdout, "Deleting archived log file %s\n",
+ archived_log->PathName().c_str());
+ ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+ ASSERT_EQ(Status::NotFound(),
+ env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+}
+
+TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+ CreateAndReopenWithCF({"new_cf"}, options);
+
+ Random rnd(5);
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+ test::RandomKey(&rnd, 10)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+ test::RandomKey(&rnd, 10)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(2U, metadata.size());
+ ASSERT_EQ("new_cf", metadata[0].column_family_name);
+ ASSERT_EQ("new_cf", metadata[1].column_family_name);
+ auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
+ ? metadata[0].name
+ : metadata[1].name;
+ auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
+ ? metadata[0].name
+ : metadata[1].name;
+ ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument());
+ ASSERT_OK(db_->DeleteFile(old_file));
+
+ {
+ std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+ ASSERT_OK(itr->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 1000);
+ }
+
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options);
+
+ {
+ std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+ ASSERT_OK(itr->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 1000);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/error_handler.cc b/src/rocksdb/db/error_handler.cc
new file mode 100644
index 000000000..3ba4d9fd9
--- /dev/null
+++ b/src/rocksdb/db/error_handler.cc
@@ -0,0 +1,344 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/error_handler.h"
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maps to help decide the severity of an error based on the
+// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
+// is set or not. There are 3 maps, going from most specific to least specific
+// (i.e. from all 4 fields in a tuple to only the BackgroundErrorReason and
+// paranoid_checks). The less specific maps serve as a catch-all in case we
+// miss a specific error code or subcode. (A condensed sketch of the lookup
+// order follows the map definitions below.)
+std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
+ Status::Severity>
+ ErrorSeverityMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kSoftError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+ true),
+ Status::Severity::kHardError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kNoSpace, true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kNoSpace, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kSpaceLimit, true),
+ Status::Severity::kHardError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kHardError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>, Status::Severity>
+ DefaultErrorSeverityMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
+ DefaultReasonMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction, false),
+ Status::Severity::kNoError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, false),
+ Status::Severity::kNoError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
+ Status::Severity::kFatalError},
+ // Errors during Memtable update
+ {std::make_tuple(BackgroundErrorReason::kMemTable, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kMemTable, false),
+ Status::Severity::kFatalError},
+};
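Taken together, the three maps are consulted from most specific to least specific. The sketch below restates the fall-through order that SetBGError() applies further down; the helper function is illustrative and not part of this file.

// Illustrative only: mirrors the lookup order used by ErrorHandler::SetBGError().
static Status::Severity LookupSeverityExample(BackgroundErrorReason reason,
                                              Status::Code code,
                                              Status::SubCode subcode,
                                              bool paranoid_checks) {
  // 1) Most specific: (reason, code, subcode, paranoid_checks).
  auto it = ErrorSeverityMap.find(
      std::make_tuple(reason, code, subcode, paranoid_checks));
  if (it != ErrorSeverityMap.end()) {
    return it->second;
  }
  // 2) Drop the subcode: (reason, code, paranoid_checks).
  auto it2 = DefaultErrorSeverityMap.find(
      std::make_tuple(reason, code, paranoid_checks));
  if (it2 != DefaultErrorSeverityMap.end()) {
    return it2->second;
  }
  // 3) Catch-all on (reason, paranoid_checks); default to fatal if unmapped.
  auto it3 = DefaultReasonMap.find(std::make_tuple(reason, paranoid_checks));
  return it3 != DefaultReasonMap.end() ? it3->second
                                       : Status::Severity::kFatalError;
}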
+
+void ErrorHandler::CancelErrorRecovery() {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+
+ // We'll release the lock before calling sfm, so make sure no new
+ // recovery gets scheduled at that point
+ auto_recovery_ = false;
+ SstFileManagerImpl* sfm = reinterpret_cast<SstFileManagerImpl*>(
+ db_options_.sst_file_manager.get());
+ if (sfm) {
+ // This may or may not cancel a pending recovery
+ db_mutex_->Unlock();
+ bool cancelled = sfm->CancelErrorRecovery(this);
+ db_mutex_->Lock();
+ if (cancelled) {
+ recovery_in_prog_ = false;
+ }
+ }
+#endif
+}
+
+// This is the main function for looking at an error during a background
+// operation and deciding the severity, and error recovery strategy. The high
+// level algorithm is as follows -
+// 1. Classify the severity of the error based on the ErrorSeverityMap,
+// DefaultErrorSeverityMap and DefaultReasonMap defined earlier
+// 2. Call a Status code specific override function to adjust the severity
+// if needed. The reason for this is our ability to recover may depend on
+// the exact options enabled in DBOptions
+// 3. Determine if auto recovery is possible. A listener notification callback
+//    is called, which can disable the auto recovery even if we decide it's
+//    feasible.
+// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
+// the actual recovery. If no sst file manager is specified in DBOptions,
+// a default one is allocated during DB::Open(), so there will always be
+// one.
+// This can also get called as part of a recovery operation. In that case, we
+// also track the error separately in recovery_error_ so we can tell in the
+// end whether recovery succeeded or not
+Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) {
+ db_mutex_->AssertHeld();
+
+ if (bg_err.ok()) {
+ return Status::OK();
+ }
+
+ bool paranoid = db_options_.paranoid_checks;
+ Status::Severity sev = Status::Severity::kFatalError;
+ Status new_bg_err;
+ bool found = false;
+
+ {
+ auto entry = ErrorSeverityMap.find(std::make_tuple(reason, bg_err.code(),
+ bg_err.subcode(), paranoid));
+ if (entry != ErrorSeverityMap.end()) {
+ sev = entry->second;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ auto entry = DefaultErrorSeverityMap.find(std::make_tuple(reason,
+ bg_err.code(), paranoid));
+ if (entry != DefaultErrorSeverityMap.end()) {
+ sev = entry->second;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+ if (entry != DefaultReasonMap.end()) {
+ sev = entry->second;
+ }
+ }
+
+ new_bg_err = Status(bg_err, sev);
+
+ // Check if recovery is currently in progress. If it is, we will save this
+ // error so we can check it at the end to see if recovery succeeded or not
+ if (recovery_in_prog_ && recovery_error_.ok()) {
+ recovery_error_ = new_bg_err;
+ }
+
+ bool auto_recovery = auto_recovery_;
+ if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
+ auto_recovery = false;
+ }
+
+ // Allow some error specific overrides
+ if (new_bg_err == Status::NoSpace()) {
+ new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
+ }
+
+ if (!new_bg_err.ok()) {
+ Status s = new_bg_err;
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
+ db_mutex_, &auto_recovery);
+ if (!s.ok() && (s.severity() > bg_error_.severity())) {
+ bg_error_ = s;
+ } else {
+      // This error is less severe than the previously encountered error.
+      // Don't take any further action
+ return bg_error_;
+ }
+ }
+
+ if (auto_recovery) {
+ recovery_in_prog_ = true;
+
+ // Kick-off error specific recovery
+ if (bg_error_ == Status::NoSpace()) {
+ RecoverFromNoSpace();
+ }
+ }
+ return bg_error_;
+}
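As a rough picture of the caller side (the actual call sites live in db_impl*.cc and are only assumed here): a failing background job reports its status through SetBGError() with the DB mutex held, and foreground paths then consult the recorded severity.

// Sketch only; DBImpl's real plumbing may differ in detail.
void ReportFlushFailureExample(ErrorHandler* error_handler,
                               InstrumentedMutex* db_mutex, const Status& s) {
  if (!s.ok()) {
    InstrumentedMutexLock l(db_mutex);  // SetBGError() expects the mutex held
    error_handler->SetBGError(s, BackgroundErrorReason::kFlush);
  }
  // Afterwards, foreground writes can check error_handler->IsDBStopped(), and
  // a manual DB::Resume() ends up in RecoverFromBGError(/*is_manual=*/true).
}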
+
+Status ErrorHandler::OverrideNoSpaceError(Status bg_error,
+ bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+ if (bg_error.severity() >= Status::Severity::kFatalError) {
+ return bg_error;
+ }
+
+ if (db_options_.sst_file_manager.get() == nullptr) {
+    // We rely on the SFM to poll for enough disk space and recover; without
+    // one, automatic recovery is not possible
+ *auto_recovery = false;
+ return bg_error;
+ }
+
+ if (db_options_.allow_2pc &&
+ (bg_error.severity() <= Status::Severity::kSoftError)) {
+ // Don't know how to recover, as the contents of the current WAL file may
+ // be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
+ // we can just flush the memtable and discard the log
+ *auto_recovery = false;
+ return Status(bg_error, Status::Severity::kFatalError);
+ }
+
+ {
+ uint64_t free_space;
+ if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
+ &free_space) == Status::NotSupported()) {
+ *auto_recovery = false;
+ }
+ }
+
+ return bg_error;
+#else
+ (void)auto_recovery;
+ return Status(bg_error, Status::Severity::kFatalError);
+#endif
+}
+
+void ErrorHandler::RecoverFromNoSpace() {
+#ifndef ROCKSDB_LITE
+ SstFileManagerImpl* sfm =
+ reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+
+ // Inform SFM of the error, so it can kick-off the recovery
+ if (sfm) {
+ sfm->StartErrorRecovery(this, bg_error_);
+ }
+#endif
+}
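RecoverFromNoSpace() only has something to hand the error to when an SstFileManager is attached to the DB. As noted above, DB::Open() allocates a default one, but an explicit, shared manager can be configured with the public NewSstFileManager() factory; the snippet below is a sketch of that setup, not code from this file (it assumes rocksdb/options.h and rocksdb/sst_file_manager.h).

// Sketch: attach a shared SstFileManager so NoSpace errors raised here can be
// retried automatically once free space is reclaimed.
static void AttachSstFileManagerExample(Options* options) {
  std::shared_ptr<SstFileManager> sfm(NewSstFileManager(Env::Default()));
  // The same manager may be shared across several DB instances.
  options->sst_file_manager = sfm;
}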
+
+Status ErrorHandler::ClearBGError() {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+
+ // Signal that recovery succeeded
+ if (recovery_error_.ok()) {
+ Status old_bg_error = bg_error_;
+ bg_error_ = Status::OK();
+ recovery_in_prog_ = false;
+ EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
+ old_bg_error, db_mutex_);
+ }
+ return recovery_error_;
+#else
+ return bg_error_;
+#endif
+}
+
+Status ErrorHandler::RecoverFromBGError(bool is_manual) {
+#ifndef ROCKSDB_LITE
+ InstrumentedMutexLock l(db_mutex_);
+    // If it's a manual recovery and there's a background recovery in
+    // progress, return a busy status
+ // return busy status
+ if (recovery_in_prog_) {
+ return Status::Busy();
+ }
+ recovery_in_prog_ = true;
+ }
+
+ if (bg_error_.severity() == Status::Severity::kSoftError) {
+ // Simply clear the background error and return
+ recovery_error_ = Status::OK();
+ return ClearBGError();
+ }
+
+ // Reset recovery_error_. We will use this to record any errors that happen
+ // during the recovery process. While recovering, the only operations that
+ // can generate background errors should be the flush operations
+ recovery_error_ = Status::OK();
+ Status s = db_->ResumeImpl();
+  // For manual recovery, shutdown, and fatal error cases, set
+ // recovery_in_prog_ to false. For automatic background recovery, leave it
+ // as is regardless of success or failure as it will be retried
+ if (is_manual || s.IsShutdownInProgress() ||
+ bg_error_.severity() >= Status::Severity::kFatalError) {
+ recovery_in_prog_ = false;
+ }
+ return s;
+#else
+ (void)is_manual;
+ return bg_error_;
+#endif
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler.h b/src/rocksdb/db/error_handler.h
new file mode 100644
index 000000000..7276f6510
--- /dev/null
+++ b/src/rocksdb/db/error_handler.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+class ErrorHandler {
+ public:
+ ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
+ InstrumentedMutex* db_mutex)
+ : db_(db),
+ db_options_(db_options),
+ bg_error_(Status::OK()),
+ recovery_error_(Status::OK()),
+ db_mutex_(db_mutex),
+ auto_recovery_(false),
+ recovery_in_prog_(false) {}
+ ~ErrorHandler() {}
+
+ void EnableAutoRecovery() { auto_recovery_ = true; }
+
+ Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
+ Status::Code code,
+ Status::SubCode subcode);
+
+ Status SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+
+ Status GetBGError() { return bg_error_; }
+
+ Status GetRecoveryError() { return recovery_error_; }
+
+ Status ClearBGError();
+
+ bool IsDBStopped() {
+ return !bg_error_.ok() &&
+ bg_error_.severity() >= Status::Severity::kHardError;
+ }
+
+ bool IsBGWorkStopped() {
+ return !bg_error_.ok() &&
+ (bg_error_.severity() >= Status::Severity::kHardError ||
+ !auto_recovery_);
+ }
+
+ bool IsRecoveryInProgress() { return recovery_in_prog_; }
+
+ Status RecoverFromBGError(bool is_manual = false);
+ void CancelErrorRecovery();
+
+ private:
+ DBImpl* db_;
+ const ImmutableDBOptions& db_options_;
+ Status bg_error_;
+  // A separate Status variable used to record any errors encountered during
+  // the process of recovering from hard errors
+ Status recovery_error_;
+ InstrumentedMutex* db_mutex_;
+ // A flag indicating whether automatic recovery from errors is enabled
+ bool auto_recovery_;
+ bool recovery_in_prog_;
+
+ Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery);
+ void RecoverFromNoSpace();
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler_test.cc b/src/rocksdb/db/error_handler_test.cc
new file mode 100644
index 000000000..b9d78490c
--- /dev/null
+++ b/src/rocksdb/db/error_handler_test.cc
@@ -0,0 +1,871 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/sst_file_manager.h"
+#include "test_util/fault_injection_test_env.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBErrorHandlingTest : public DBTestBase {
+ public:
+ DBErrorHandlingTest() : DBTestBase("/db_error_handling_test") {}
+
+ std::string GetManifestNameFromLiveFiles() {
+ std::vector<std::string> live_files;
+ uint64_t manifest_size;
+
+ dbfull()->GetLiveFiles(live_files, &manifest_size, false);
+ for (auto& file : live_files) {
+ uint64_t num = 0;
+ FileType type;
+ if (ParseFileName(file, &num, &type) && type == kDescriptorFile) {
+ return file;
+ }
+ }
+ return "";
+ }
+};
+
+class DBErrorHandlingEnv : public EnvWrapper {
+ public:
+ DBErrorHandlingEnv() : EnvWrapper(Env::Default()),
+ trig_no_space(false), trig_io_error(false) {}
+
+ void SetTrigNoSpace() {trig_no_space = true;}
+ void SetTrigIoError() {trig_io_error = true;}
+ private:
+ bool trig_no_space;
+ bool trig_io_error;
+};
+
+class ErrorHandlerListener : public EventListener {
+ public:
+ ErrorHandlerListener()
+ : mutex_(),
+ cv_(&mutex_),
+ no_auto_recovery_(false),
+ recovery_complete_(false),
+ file_creation_started_(false),
+ override_bg_error_(false),
+ file_count_(0),
+ fault_env_(nullptr) {}
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /*ti*/) override {
+ InstrumentedMutexLock l(&mutex_);
+ file_creation_started_ = true;
+ if (file_count_ > 0) {
+ if (--file_count_ == 0) {
+ fault_env_->SetFilesystemActive(false, file_creation_error_);
+ file_creation_error_ = Status::OK();
+ }
+ }
+ cv_.SignalAll();
+ }
+
+ void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
+ Status /*bg_error*/,
+ bool* auto_recovery) override {
+ if (*auto_recovery && no_auto_recovery_) {
+ *auto_recovery = false;
+ }
+ }
+
+ void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
+ InstrumentedMutexLock l(&mutex_);
+ recovery_complete_ = true;
+ cv_.SignalAll();
+ }
+
+ bool WaitForRecovery(uint64_t /*abs_time_us*/) {
+ InstrumentedMutexLock l(&mutex_);
+ while (!recovery_complete_) {
+ cv_.Wait(/*abs_time_us*/);
+ }
+ if (recovery_complete_) {
+ recovery_complete_ = false;
+ return true;
+ }
+ return false;
+ }
+
+ void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) {
+ InstrumentedMutexLock l(&mutex_);
+ while (!file_creation_started_) {
+ cv_.Wait(/*abs_time_us*/);
+ }
+ file_creation_started_ = false;
+ }
+
+ void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ Status* bg_error) override {
+ if (override_bg_error_) {
+ *bg_error = bg_error_;
+ override_bg_error_ = false;
+ }
+ }
+
+ void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
+
+ void OverrideBGError(Status bg_err) {
+ bg_error_ = bg_err;
+ override_bg_error_ = true;
+ }
+
+ void InjectFileCreationError(FaultInjectionTestEnv* env, int file_count,
+ Status s) {
+ fault_env_ = env;
+ file_count_ = file_count;
+ file_creation_error_ = s;
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ InstrumentedCondVar cv_;
+ bool no_auto_recovery_;
+ bool recovery_complete_;
+ bool file_creation_started_;
+ bool override_bg_error_;
+ int file_count_;
+ Status file_creation_error_;
+ Status bg_error_;
+ FaultInjectionTestEnv* fault_env_;
+};
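The listener above folds fault injection and test synchronization into the standard EventListener hooks. Stripped down to what an application would typically register, the same two hooks look like the sketch below; the class name and logging are illustrative only.

// Illustrative application-side listener; not used by the tests below.
class LoggingErrorListener : public EventListener {
 public:
  void OnBackgroundError(BackgroundErrorReason /*reason*/,
                         Status* bg_error) override {
    // Inspect (or, like the test listener, override) the error RocksDB is
    // about to record as its background error.
    fprintf(stderr, "background error: %s\n", bg_error->ToString().c_str());
  }
  void OnErrorRecoveryCompleted(Status old_bg_error) override {
    fprintf(stderr, "recovered from: %s\n", old_bg_error.ToString().c_str());
  }
};
// Registered the same way as the test listener:
//   options.listeners.emplace_back(std::make_shared<LoggingErrorListener>());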
+
+TEST_F(DBErrorHandlingTest, FLushWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ Put(Key(0), "val");
+ SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::Start", [&](void *) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_EQ(s, Status::OK());
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, ManifestWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ Put(Key(0), "val");
+ Flush();
+ Put(Key(1), "val");
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void *) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_EQ(s, Status::OK());
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, DoubleManifestWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ Put(Key(0), "val");
+ Flush();
+ Put(Key(1), "val");
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void *) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ fault_env->SetFilesystemActive(true);
+
+ // This Resume() will attempt to create a new manifest file and fail again
+ s = dbfull()->Resume();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ fault_env->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // A successful Resume() will create a new manifest file
+ s = dbfull()->Resume();
+ ASSERT_EQ(s, Status::OK());
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, CompactionManifestWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.env = fault_env.get();
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+ std::atomic<bool> fail_manifest(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ Put(Key(0), "val");
+ Put(Key(2), "val");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ // Wait for flush of 2nd L0 file before starting compaction
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ // Wait for compaction to detect manifest write error
+ {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+ // Make compaction thread wait for error to be cleared
+ {"CompactionManifestWriteError:1",
+ "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+ // Wait for DB instance to clear bg_error before calling
+ // TEST_WaitForCompact
+ {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}});
+ // trigger manifest write failure in compaction thread
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (fail_manifest.load()) {
+ fault_env->SetFilesystemActive(false,
+ Status::NoSpace("Out of space"));
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(Key(1), "val");
+ // This Flush will trigger a compaction, which will fail when appending to
+ // the manifest
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ TEST_SYNC_POINT("CompactionManifestWriteError:0");
+ // Clear all errors so when the compaction is retried, it will succeed
+ fault_env->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("CompactionManifestWriteError:1");
+ TEST_SYNC_POINT("CompactionManifestWriteError:2");
+
+ s = dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(s, Status::OK());
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ ASSERT_EQ("val", Get(Key(2)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, CompactionWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.env = fault_env.get();
+ Status s;
+ DestroyAndReopen(options);
+
+ Put(Key(0), "va;");
+ Put(Key(2), "va;");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ listener->OverrideBGError(
+ Status(Status::NoSpace(), Status::Severity::kHardError)
+ );
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(Key(1), "val");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+
+ fault_env->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_EQ(s, Status::OK());
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, CorruptionError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.env = fault_env.get();
+ Status s;
+ DestroyAndReopen(options);
+
+ Put(Key(0), "va;");
+ Put(Key(2), "va;");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_env->SetFilesystemActive(false, Status::Corruption("Corruption"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(Key(1), "val");
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+
+ fault_env->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_NE(s, Status::OK());
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, AutoRecoverFlushError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ Put(Key(0), "val");
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ s = Put(Key(1), "val");
+ ASSERT_EQ(s, Status::OK());
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, FailRecoverFlushError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ Put(Key(0), "val");
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  // We should be able to shut down the database while auto recovery is going
+  // on in the background
+ Close();
+ DestroyDB(dbname_, options);
+}
+
+TEST_F(DBErrorHandlingTest, WALWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+ Random rnd(301);
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i<100; ++i) {
+ batch.Put(Key(i), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK());
+ };
+
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i<199; ++i) {
+ batch.Put(Key(i), RandomString(&rnd, 1024));
+ }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(s, s.NoSpace());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ for (auto i=0; i<199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Reopen(options);
+ for (auto i=0; i<199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, MultiCFWALWriteError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.env = fault_env.get();
+ options.listeners.emplace_back(listener);
+ Status s;
+ Random rnd(301);
+
+ listener->EnableAutoRecovery();
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ {
+ WriteBatch batch;
+
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 100; ++j) {
+ batch.Put(handles_[i], Key(j), RandomString(&rnd, 1024));
+ }
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK());
+ };
+
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ // Write to one CF
+ for (auto i = 100; i < 199; ++i) {
+ batch.Put(handles_[2], Key(i), RandomString(&rnd, 1024));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_env->SetFilesystemActive(false,
+ Status::NoSpace("Out of space"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(s, s.NoSpace());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_env->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ for (auto i = 1; i < 4; ++i) {
+ // Every CF should have been flushed
+ ASSERT_EQ(NumTableFilesAtLevel(0, i), 1);
+ }
+
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 199; ++j) {
+ if (j < 100) {
+ ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+ }
+ }
+ }
+ ReopenWithColumnFamilies({"default", "one", "two", "three"}, options);
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 199; ++j) {
+ if (j < 100) {
+ ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+ }
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingTest, MultiDBCompactionError) {
+ FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default());
+ std::vector<std::unique_ptr<FaultInjectionTestEnv>> fault_env;
+ std::vector<Options> options;
+ std::vector<std::shared_ptr<ErrorHandlerListener>> listener;
+ std::vector<DB*> db;
+ std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+ int kNumDbInstances = 3;
+ Random rnd(301);
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ listener.emplace_back(new ErrorHandlerListener());
+ options.emplace_back(GetDefaultOptions());
+ fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default()));
+ options[i].create_if_missing = true;
+ options[i].level0_file_num_compaction_trigger = 2;
+ options[i].writable_file_max_buffer_size = 32768;
+ options[i].env = fault_env[i].get();
+ options[i].listeners.emplace_back(listener[i]);
+ options[i].sst_file_manager = sfm;
+ DB* dbptr;
+ char buf[16];
+
+ listener[i]->EnableAutoRecovery();
+ // Setup for returning error for the 3rd SST, which would be level 1
+ listener[i]->InjectFileCreationError(fault_env[i].get(), 3,
+ Status::NoSpace("Out of space"));
+ snprintf(buf, sizeof(buf), "_%d", i);
+ DestroyDB(dbname_ + std::string(buf), options[i]);
+ ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr),
+ Status::OK());
+ db.emplace_back(dbptr);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ for (auto j = 0; j <= 100; ++j) {
+ batch.Put(Key(j), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+ }
+
+ def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+    // Write a second batch of keys
+ for (auto j = 100; j < 199; ++j) {
+ batch.Put(Key(j), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+ ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+ fault_env[i]->SetFilesystemActive(true);
+ }
+
+ def_env->SetFilesystemActive(true);
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ std::string prop;
+ ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+ ASSERT_EQ(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true),
+ Status::OK());
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(0), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 0);
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(1), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 1);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "_%d", i);
+ delete db[i];
+ fault_env[i]->SetFilesystemActive(true);
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+ } else {
+ Status s = DestroyDB(dbname_ + std::string(buf), options[i]);
+ }
+ }
+ options.clear();
+ sfm.reset();
+ delete def_env;
+}
+
+TEST_F(DBErrorHandlingTest, MultiDBVariousErrors) {
+ FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default());
+ std::vector<std::unique_ptr<FaultInjectionTestEnv>> fault_env;
+ std::vector<Options> options;
+ std::vector<std::shared_ptr<ErrorHandlerListener>> listener;
+ std::vector<DB*> db;
+ std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+ int kNumDbInstances = 3;
+ Random rnd(301);
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ listener.emplace_back(new ErrorHandlerListener());
+ options.emplace_back(GetDefaultOptions());
+ fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default()));
+ options[i].create_if_missing = true;
+ options[i].level0_file_num_compaction_trigger = 2;
+ options[i].writable_file_max_buffer_size = 32768;
+ options[i].env = fault_env[i].get();
+ options[i].listeners.emplace_back(listener[i]);
+ options[i].sst_file_manager = sfm;
+ DB* dbptr;
+ char buf[16];
+
+ listener[i]->EnableAutoRecovery();
+ switch (i) {
+ case 0:
+ // Setup for returning error for the 3rd SST, which would be level 1
+ listener[i]->InjectFileCreationError(fault_env[i].get(), 3,
+ Status::NoSpace("Out of space"));
+ break;
+ case 1:
+ // Setup for returning error after the 1st SST, which would result
+ // in a hard error
+ listener[i]->InjectFileCreationError(fault_env[i].get(), 2,
+ Status::NoSpace("Out of space"));
+ break;
+ default:
+ break;
+ }
+ snprintf(buf, sizeof(buf), "_%d", i);
+ DestroyDB(dbname_ + std::string(buf), options[i]);
+ ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr),
+ Status::OK());
+ db.emplace_back(dbptr);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ for (auto j = 0; j <= 100; ++j) {
+ batch.Put(Key(j), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+ }
+
+ def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ // Write to one CF
+ for (auto j = 100; j < 199; ++j) {
+ batch.Put(Key(j), RandomString(&rnd, 1024));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+ if (i != 1) {
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+ } else {
+ ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace());
+ }
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+ switch (i) {
+ case 0:
+ ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+ break;
+ case 1:
+ ASSERT_EQ(s.severity(), Status::Severity::kHardError);
+ break;
+ case 2:
+ ASSERT_EQ(s, Status::OK());
+ break;
+ }
+ fault_env[i]->SetFilesystemActive(true);
+ }
+
+ def_env->SetFilesystemActive(true);
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ std::string prop;
+ if (i < 2) {
+ ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+ }
+ if (i == 1) {
+ ASSERT_EQ(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true),
+ Status::OK());
+ }
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(0), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 0);
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(1), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 1);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "_%d", i);
+ fault_env[i]->SetFilesystemActive(true);
+ delete db[i];
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+ } else {
+ DestroyDB(dbname_ + std::string(buf), options[i]);
+ }
+ }
+ options.clear();
+ delete def_env;
+}
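+
+// For illustration, the expected severities above follow from where the
+// injected NoSpace error lands: instance 0 fails on its 3rd SST (a level-1
+// compaction output) and reports a soft background error, instance 1 fails
+// on its 2nd SST (a flush output) and reports a hard error, and instance 2
+// has no injected error and stays at Status::OK().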
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBErrorHandlingTest is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/event_helpers.cc b/src/rocksdb/db/event_helpers.cc
new file mode 100644
index 000000000..57aa711fc
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.cc
@@ -0,0 +1,223 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/event_helpers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+template <class T>
+inline T SafeDivide(T a, T b) {
+ return b == 0 ? 0 : a / b;
+}
+} // namespace
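+
+// For illustration (hypothetical operands), SafeDivide protects the average
+// size computations below when a table has no entries:
+//   SafeDivide(uint64_t{1000}, uint64_t{0});    // -> 0
+//   SafeDivide(uint64_t{1000}, uint64_t{250});  // -> 4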
+
+void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) {
+ *jwriter << "time_micros"
+ << std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
+}
+
+#ifndef ROCKSDB_LITE
+void EventHelpers::NotifyTableFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, TableFileCreationReason reason) {
+ TableFileCreationBriefInfo info;
+ info.db_name = db_name;
+ info.cf_name = cf_name;
+ info.file_path = file_path;
+ info.job_id = job_id;
+ info.reason = reason;
+ for (auto& listener : listeners) {
+ listener->OnTableFileCreationStarted(info);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+void EventHelpers::NotifyOnBackgroundError(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex,
+ bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+ if (listeners.size() == 0U) {
+ return;
+ }
+ db_mutex->AssertHeld();
+ // release lock while notifying events
+ db_mutex->Unlock();
+ for (auto& listener : listeners) {
+ listener->OnBackgroundError(reason, bg_error);
+ if (*auto_recovery) {
+ listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery);
+ }
+ }
+ db_mutex->Lock();
+#else
+ (void)listeners;
+ (void)reason;
+ (void)bg_error;
+ (void)db_mutex;
+ (void)auto_recovery;
+#endif // ROCKSDB_LITE
+}
+
+void EventHelpers::LogAndNotifyTableFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, const FileDescriptor& fd,
+ uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+ TableFileCreationReason reason, const Status& s) {
+ if (s.ok() && event_logger) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+ jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+ << "table_file_creation"
+ << "file_number" << fd.GetNumber() << "file_size"
+ << fd.GetFileSize();
+
+ // table_properties
+ {
+ jwriter << "table_properties";
+ jwriter.StartObject();
+
+ // basic properties:
+ jwriter << "data_size" << table_properties.data_size << "index_size"
+ << table_properties.index_size << "index_partitions"
+ << table_properties.index_partitions << "top_level_index_size"
+ << table_properties.top_level_index_size
+ << "index_key_is_user_key"
+ << table_properties.index_key_is_user_key
+ << "index_value_is_delta_encoded"
+ << table_properties.index_value_is_delta_encoded << "filter_size"
+ << table_properties.filter_size << "raw_key_size"
+ << table_properties.raw_key_size << "raw_average_key_size"
+ << SafeDivide(table_properties.raw_key_size,
+ table_properties.num_entries)
+ << "raw_value_size" << table_properties.raw_value_size
+ << "raw_average_value_size"
+ << SafeDivide(table_properties.raw_value_size,
+ table_properties.num_entries)
+ << "num_data_blocks" << table_properties.num_data_blocks
+ << "num_entries" << table_properties.num_entries
+ << "num_deletions" << table_properties.num_deletions
+ << "num_merge_operands" << table_properties.num_merge_operands
+ << "num_range_deletions" << table_properties.num_range_deletions
+ << "format_version" << table_properties.format_version
+ << "fixed_key_len" << table_properties.fixed_key_len
+ << "filter_policy" << table_properties.filter_policy_name
+ << "column_family_name" << table_properties.column_family_name
+ << "column_family_id" << table_properties.column_family_id
+ << "comparator" << table_properties.comparator_name
+ << "merge_operator" << table_properties.merge_operator_name
+ << "prefix_extractor_name"
+ << table_properties.prefix_extractor_name << "property_collectors"
+ << table_properties.property_collectors_names << "compression"
+ << table_properties.compression_name << "compression_options"
+ << table_properties.compression_options << "creation_time"
+ << table_properties.creation_time << "oldest_key_time"
+ << table_properties.oldest_key_time << "file_creation_time"
+ << table_properties.file_creation_time;
+
+ // user collected properties
+ for (const auto& prop : table_properties.readable_properties) {
+ jwriter << prop.first << prop.second;
+ }
+ jwriter.EndObject();
+ }
+
+ if (oldest_blob_file_number != kInvalidBlobFileNumber) {
+ jwriter << "oldest_blob_file_number" << oldest_blob_file_number;
+ }
+
+ jwriter.EndObject();
+
+ event_logger->Log(jwriter);
+ }
+
+#ifndef ROCKSDB_LITE
+ if (listeners.size() == 0) {
+ return;
+ }
+ TableFileCreationInfo info;
+ info.db_name = db_name;
+ info.cf_name = cf_name;
+ info.file_path = file_path;
+ info.file_size = fd.file_size;
+ info.job_id = job_id;
+ info.table_properties = table_properties;
+ info.reason = reason;
+ info.status = s;
+ for (auto& listener : listeners) {
+ listener->OnTableFileCreated(info);
+ }
+#else
+ (void)listeners;
+ (void)db_name;
+ (void)cf_name;
+ (void)file_path;
+ (void)reason;
+#endif // !ROCKSDB_LITE
+}
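+
+// For illustration only (field values are made up), the JSON event logged
+// above has roughly this shape, with the nested "table_properties" object
+// elided:
+//   {"time_micros": 1596000000000000, "cf_name": "default", "job": 7,
+//    "event": "table_file_creation", "file_number": 42, "file_size": 4096,
+//    "table_properties": {...}}
+// Listeners receive the same data in structured form via OnTableFileCreated.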
+
+void EventHelpers::LogAndNotifyTableFileDeletion(
+ EventLogger* event_logger, int job_id, uint64_t file_number,
+ const std::string& file_path, const Status& status,
+ const std::string& dbname,
+ const std::vector<std::shared_ptr<EventListener>>& listeners) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+
+ jwriter << "job" << job_id << "event"
+ << "table_file_deletion"
+ << "file_number" << file_number;
+ if (!status.ok()) {
+ jwriter << "status" << status.ToString();
+ }
+
+ jwriter.EndObject();
+
+ event_logger->Log(jwriter);
+
+#ifndef ROCKSDB_LITE
+ TableFileDeletionInfo info;
+ info.db_name = dbname;
+ info.job_id = job_id;
+ info.file_path = file_path;
+ info.status = status;
+ for (auto& listener : listeners) {
+ listener->OnTableFileDeleted(info);
+ }
+#else
+ (void)file_path;
+ (void)dbname;
+ (void)listeners;
+#endif // !ROCKSDB_LITE
+}
+
+void EventHelpers::NotifyOnErrorRecoveryCompleted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ Status old_bg_error, InstrumentedMutex* db_mutex) {
+#ifndef ROCKSDB_LITE
+ if (listeners.size() == 0U) {
+ return;
+ }
+ db_mutex->AssertHeld();
+ // release lock while notifying events
+ db_mutex->Unlock();
+ for (auto& listener : listeners) {
+ listener->OnErrorRecoveryCompleted(old_bg_error);
+ }
+ db_mutex->Lock();
+#else
+ (void)listeners;
+ (void)old_bg_error;
+ (void)db_mutex;
+#endif // ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/event_helpers.h b/src/rocksdb/db/event_helpers.h
new file mode 100644
index 000000000..87cc1cb8c
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "logging/event_logger.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventHelpers {
+ public:
+ static void AppendCurrentTime(JSONWriter* json_writer);
+#ifndef ROCKSDB_LITE
+ static void NotifyTableFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, TableFileCreationReason reason);
+#endif // !ROCKSDB_LITE
+ static void NotifyOnBackgroundError(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ BackgroundErrorReason reason, Status* bg_error,
+ InstrumentedMutex* db_mutex, bool* auto_recovery);
+ static void LogAndNotifyTableFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, const FileDescriptor& fd,
+ uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+ TableFileCreationReason reason, const Status& s);
+ static void LogAndNotifyTableFileDeletion(
+ EventLogger* event_logger, int job_id,
+ uint64_t file_number, const std::string& file_path,
+ const Status& status, const std::string& db_name,
+ const std::vector<std::shared_ptr<EventListener>>& listeners);
+ static void NotifyOnErrorRecoveryCompleted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ Status bg_error, InstrumentedMutex* db_mutex);
+
+ private:
+ static void LogAndNotifyTableFileCreation(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const FileDescriptor& fd, const TableFileCreationInfo& info);
+};
+
+} // namespace ROCKSDB_NAMESPACE
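+
+// Illustrative usage sketch (caller names are hypothetical): callers hold the
+// DB mutex, and the helper drops and re-acquires it around the listener
+// callbacks:
+//   // db_mutex held
+//   EventHelpers::NotifyOnErrorRecoveryCompleted(options.listeners,
+//                                                old_bg_error, &db_mutex);
+//   // db_mutex held again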
diff --git a/src/rocksdb/db/experimental.cc b/src/rocksdb/db/experimental.cc
new file mode 100644
index 000000000..d12882c8f
--- /dev/null
+++ b/src/rocksdb/db/experimental.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/experimental.h"
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+#ifndef ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ if (db == nullptr) {
+ return Status::InvalidArgument("DB is empty");
+ }
+
+ return db->SuggestCompactRange(column_family, begin, end);
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+ if (db == nullptr) {
+ return Status::InvalidArgument("Didn't recognize DB object");
+ }
+ return db->PromoteL0(column_family, target_level);
+}
+
+#else // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) {
+ return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+ int /*target_level*/) {
+ return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+#endif // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
+ return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
+}
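+
+// Illustrative usage sketch (the DB handle and keys are placeholders):
+//   Slice begin("a"), end("z");
+//   Status s = experimental::SuggestCompactRange(db, &begin, &end);
+//   if (s.ok()) {
+//     s = experimental::PromoteL0(db, db->DefaultColumnFamily(), 1);
+//   }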
+
+} // namespace experimental
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_basic_test.cc b/src/rocksdb/db/external_sst_file_basic_test.cc
new file mode 100644
index 000000000..b184df20e
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_basic_test.cc
@@ -0,0 +1,1128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class ExternalSSTFileBasicTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default()));
+ DestroyAndRecreateExternalSSTFilesDir();
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ test::DestroyDir(env_, sst_files_dir_);
+ env_->CreateDir(sst_files_dir_);
+ }
+
+ Status DeprecatedAddFile(const std::vector<std::string>& files,
+ bool move_files = false,
+ bool skip_snapshot_check = false) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ return db_->IngestExternalFile(files, opts);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys,
+ const std::vector<ValueType>& value_types,
+ std::vector<std::pair<int, int>> range_deletions, int file_id,
+ bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ assert(value_types.size() == 1 || keys.size() == value_types.size());
+ std::string file_path = sst_files_dir_ + ToString(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (size_t i = 0; i < range_deletions.size(); i++) {
+      // Account for the effect of range deletions on true_data before
+      // all point operations, even though sst_file_writer.DeleteRange
+      // must be called before the other sst_file_writer methods. This is
+      // because point writes take precedence over range deletions
+      // in the same ingested SST.
+ std::string start_key = Key(range_deletions[i].first);
+ std::string end_key = Key(range_deletions[i].second);
+ s = sst_file_writer.DeleteRange(start_key, end_key);
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ auto start_key_it = true_data->find(start_key);
+ if (start_key_it == true_data->end()) {
+ start_key_it = true_data->upper_bound(start_key);
+ }
+ auto end_key_it = true_data->find(end_key);
+ if (end_key_it == true_data->end()) {
+ end_key_it = true_data->upper_bound(end_key);
+ }
+ true_data->erase(start_key_it, end_key_it);
+ }
+ for (size_t i = 0; i < keys.size(); i++) {
+ std::string key = Key(keys[i]);
+ std::string value = Key(keys[i]) + ToString(file_id);
+ ValueType value_type =
+ (value_types.size() == 1 ? value_types[0] : value_types[i]);
+ switch (value_type) {
+ case ValueType::kTypeValue:
+ s = sst_file_writer.Put(key, value);
+ (*true_data)[key] = value;
+ break;
+ case ValueType::kTypeMerge:
+ s = sst_file_writer.Merge(key, value);
+ // we only use TestPutOperator in this test
+ (*true_data)[key] = value;
+ break;
+ case ValueType::kTypeDeletion:
+ s = sst_file_writer.Delete(key);
+ true_data->erase(key);
+ break;
+ default:
+ return Status::InvalidArgument("Value type is not supported");
+ }
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+
+ if (s.ok()) {
+ IngestExternalFileOptions ifo;
+ ifo.allow_global_seqno = true;
+ ifo.write_global_seqno = write_global_seqno;
+ ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+ s = db_->IngestExternalFile({file_path}, ifo);
+ }
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys,
+ const std::vector<ValueType>& value_types, int file_id,
+ bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ return GenerateAndAddExternalFile(
+ options, keys, value_types, {}, file_id, write_global_seqno,
+ verify_checksums_before_ingest, true_data);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys, const ValueType value_type,
+ int file_id, bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ return GenerateAndAddExternalFile(
+ options, keys, std::vector<ValueType>(1, value_type), file_id,
+ write_global_seqno, verify_checksums_before_ingest, true_data);
+ }
+
+ ~ExternalSSTFileBasicTest() override {
+ test::DestroyDir(env_, sst_files_dir_);
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
+};
+
+TEST_F(ExternalSSTFileBasicTest, Basic) {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before
+  // opening a file.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+
+  // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+  // sst_file_writer has already finished, so this value cannot be added
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ s = sst_file_writer.DeleteRange(Key(100), Key(200));
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ s = DeprecatedAddFile({file1});
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ DestroyAndRecreateExternalSSTFilesDir();
+}
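+
+// Minimal usage sketch of the flow exercised above (paths and keys are
+// placeholders); SstFileWriter requires keys to be added in ascending order:
+//   SstFileWriter writer(EnvOptions(), options);
+//   writer.Open("/tmp/example.sst");
+//   writer.Put("key1", "value1");
+//   writer.Put("key2", "value2");
+//   writer.Finish();
+//   db->IngestExternalFile({"/tmp/example.sst"}, IngestExternalFileOptions());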
+
+TEST_F(ExternalSSTFileBasicTest, NoCopy) {
+ Options options = CurrentOptions();
+ const ImmutableCFOptions ioptions(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+
+ // file2.sst (100 => 299)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(299));
+
+ // file3.sst (110 => 124) .. overlap with file2.sst
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 110; k < 125; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 15);
+ ASSERT_EQ(file3_info.smallest_key, Key(110));
+ ASSERT_EQ(file3_info.largest_key, Key(124));
+
+ s = DeprecatedAddFile({file1}, true /* move file */);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
+
+ s = DeprecatedAddFile({file2}, false /* copy file */);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_OK(env_->FileExists(file2));
+
+ // This file has overlapping values with the existing data
+ s = DeprecatedAddFile({file3}, true /* move file */);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ ASSERT_OK(env_->FileExists(file3));
+
+ for (int k = 0; k < 300; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, ValueType::kTypeValue, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ options.merge_operator.reset(new TestPutOperator());
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120}, {ValueType::kTypeValue}, {{120, 135}}, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{110, 120}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // The range deletion ends on a key, but it doesn't actually delete
+ // this key because the largest key in the range is exclusive. Still,
+ // it counts as an overlap so a new seqno will be assigned.
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{100, 109}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ options.merge_operator.reset(new TestPutOperator());
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+ ValueType::kTypeMerge, ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+ ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6},
+ {ValueType::kTypeDeletion, ValueType::kTypeValue,
+ ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19},
+ {ValueType::kTypeDeletion, ValueType::kTypeMerge,
+ ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, {ValueType::kTypeMerge, ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {150, 151, 152},
+ {ValueType::kTypeValue, ValueType::kTypeMerge,
+ ValueType::kTypeDeletion},
+ {{150, 160}, {180, 190}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {150, 151, 152},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+ {{200, 250}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {300, 301, 302},
+ {ValueType::kTypeValue, ValueType::kTypeMerge,
+ ValueType::kTypeDeletion},
+ {{1, 2}, {152, 154}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42},
+ {ValueType::kTypeValue, ValueType::kTypeDeletion,
+ ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40},
+ {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+ ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless of whether the file
+    // overwrites keys in the DB or not, because we have a snapshot
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150},
+ {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+ ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
+ Options options = CurrentOptions();
+ const int kNumKeys = 10000;
+
+ size_t total_fadvised_bytes = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileWriter::Rep::InvalidatePageCache", [&](void* arg) {
+ size_t fadvise_size = *(reinterpret_cast<size_t*>(arg));
+ total_fadvised_bytes += fadvise_size;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::unique_ptr<SstFileWriter> sst_file_writer;
+
+ std::string sst_file_path = sst_files_dir_ + "file_fadvise_disable.sst";
+ sst_file_writer.reset(
+ new SstFileWriter(EnvOptions(), options, nullptr, false));
+ ASSERT_OK(sst_file_writer->Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+ // fadvise disabled
+ ASSERT_EQ(total_fadvised_bytes, 0);
+
+ sst_file_path = sst_files_dir_ + "file_fadvise_enable.sst";
+ sst_file_writer.reset(
+ new SstFileWriter(EnvOptions(), options, nullptr, true));
+ ASSERT_OK(sst_file_writer->Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+ // fadvise enabled
+ ASSERT_EQ(total_fadvised_bytes, sst_file_writer->FileSize());
+ ASSERT_GT(total_fadvised_bytes, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
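+
+// For illustration, the fourth SstFileWriter constructor argument used above
+// toggles page cache invalidation (fadvise) after writes:
+//   SstFileWriter no_fadvise(EnvOptions(), options, nullptr, false);
+//   SstFileWriter with_fadvise(EnvOptions(), options, nullptr, true);
+// so the first writer reports zero fadvised bytes and the second reports
+// roughly the final file size, as asserted in the test.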
+
+TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = fault_injection_test_env_.get();
+
+ std::vector<std::pair<std::string, std::string>> test_cases = {
+ {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile",
+ "ExternalSstFileIngestionJob::AfterSyncIngestedFile"},
+ {"ExternalSstFileIngestionJob::BeforeSyncDir",
+ "ExternalSstFileIngestionJob::AfterSyncDir"},
+ {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno",
+ "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}};
+
+ for (size_t i = 0; i < test_cases.size(); i++) {
+ SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) {
+ fault_injection_test_env_->SetFilesystemActive(false);
+ });
+ SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) {
+ fault_injection_test_env_->SetFilesystemActive(true);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ if (i == 2) {
+ ASSERT_OK(Put("foo", "v1"));
+ }
+
+ Options sst_file_writer_options;
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name =
+ sst_files_dir_ + "sync_failure_test_" + ToString(i) + ".sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+ ASSERT_OK(sst_file_writer->Finish());
+
+ IngestExternalFileOptions ingest_opt;
+ if (i == 0) {
+ ingest_opt.move_files = true;
+ }
+ const Snapshot* snapshot = db_->GetSnapshot();
+ if (i == 2) {
+ ingest_opt.write_global_seqno = true;
+ }
+ ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok());
+ db_->ReleaseSnapshot(snapshot);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Destroy(options);
+ }
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
+ Options options;
+ options.create_if_missing = true;
+ SpecialEnv senv(Env::Default());
+ options.env = &senv;
+ DestroyAndReopen(options);
+
+ Options sst_file_writer_options;
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ Random rnd(301);
+ std::string value = DBTestBase::RandomString(&rnd, 4000);
+ for (int i = 0; i < 5000; i++) {
+ ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+
+ // Ingest it once without verifying checksums to see the baseline
+ // preads.
+ IngestExternalFileOptions ingest_opt;
+ ingest_opt.move_files = false;
+ senv.count_random_reads_ = true;
+ senv.random_read_bytes_counter_ = 0;
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+ auto base_num_reads = senv.random_read_counter_.Read();
+ // Make sure the counter is enabled.
+ ASSERT_GT(base_num_reads, 0);
+
+  // Ingest again and observe the reads made for readahead.
+ ingest_opt.move_files = false;
+ ingest_opt.verify_checksums_before_ingest = true;
+ ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024};
+
+ senv.count_random_reads_ = true;
+ senv.random_read_bytes_counter_ = 0;
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+ // Make sure the counter is enabled.
+ ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0);
+
+  // The SST file is about 20 MB and the readahead size is 2 MB.
+  // Allowing a conservative 15 reads for metadata blocks, the number
+  // of random reads should be around 20 MB / 2 MB + 15 = 25; the
+  // assertion below uses a looser bound of 40 for slack.
+ ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40);
+
+ Destroy(options);
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+ int kNumLevels = 7;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = kNumLevels;
+ Reopen(options);
+
+ std::map<std::string, std::string> true_data;
+ int file_id = 1;
+ // prevent range deletions from being dropped due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable
+ for (int i = 0; i < 3; i++) {
+ if (i != 0) {
+ db_->Flush(FlushOptions());
+ if (i == 1) {
+ MoveFilesToLevel(kNumLevels - 1);
+ }
+ }
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(50 * i), Key(50 * (i + 1))));
+ }
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1));
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // overlaps with L0 file but not memtable, so flush is skipped and file is
+ // ingested into L0
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ {{65, 70}, {70, 85}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // overlaps with L6 file but not memtable or L0 file, so flush is skipped and
+ // file is ingested into L5
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // overlaps with L5 file but not memtable or L0 file, so flush is skipped and
+ // file is ingested into L4
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{5, 15}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // ingested file overlaps with memtable, so flush is triggered before the file
+ // is ingested such that the ingested data is considered newest. So L0 file
+ // count increases by two.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(4, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // snapshot unneeded now that all range deletions are persisted
+ db_->ReleaseSnapshot(snapshot);
+
+  // overlaps with nothing, so it is placed at the bottom level and the
+  // seqnum is not incremented.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ {{160, 200}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+ ASSERT_EQ(4, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1));
+}
+
+TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) {
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file8.sst (delete 300 => 400)
+ std::string file8 = sst_files_dir_ + "file8.sst";
+ ASSERT_OK(sst_file_writer.Open(file8));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400)));
+ ExternalSstFileInfo file8_info;
+ Status s = sst_file_writer.Finish(&file8_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file8_info.file_path, file8);
+ ASSERT_EQ(file8_info.num_entries, 0);
+ ASSERT_EQ(file8_info.smallest_key, "");
+ ASSERT_EQ(file8_info.largest_key, "");
+ ASSERT_EQ(file8_info.num_range_del_entries, 1);
+ ASSERT_EQ(file8_info.smallest_range_del_key, Key(300));
+ ASSERT_EQ(file8_info.largest_range_del_key, Key(400));
+
+ // file9.sst (delete 400 => 500)
+ std::string file9 = sst_files_dir_ + "file9.sst";
+ ASSERT_OK(sst_file_writer.Open(file9));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+ ExternalSstFileInfo file9_info;
+ s = sst_file_writer.Finish(&file9_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file9_info.file_path, file9);
+ ASSERT_EQ(file9_info.num_entries, 0);
+ ASSERT_EQ(file9_info.smallest_key, "");
+ ASSERT_EQ(file9_info.largest_key, "");
+ ASSERT_EQ(file9_info.num_range_del_entries, 1);
+ ASSERT_EQ(file9_info.smallest_range_del_key, Key(400));
+ ASSERT_EQ(file9_info.largest_range_del_key, Key(500));
+
+ // Range deletion tombstones are exclusive on their end key, so these SSTs
+ // should not be considered as overlapping.
+ s = DeprecatedAddFile({file8, file9});
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ DestroyAndRecreateExternalSSTFilesDir();
+}
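+
+// For illustration, the exclusivity of range tombstone end keys means:
+//   sst_file_writer.DeleteRange(Key(300), Key(400));  // covers 300..399
+//   sst_file_writer.DeleteRange(Key(400), Key(500));  // covers 400..499
+// so the two files above are adjacent rather than overlapping and can be
+// ingested together without assigning a global seqno.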
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) {
+ bool change_checksum_called = false;
+ const auto& change_checksum = [&](void* arg) {
+ if (!change_checksum_called) {
+ char* buf = reinterpret_cast<char*>(arg);
+ assert(nullptr != buf);
+ buf[0] ^= 0x1;
+ change_checksum_called = true;
+ }
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum",
+ change_checksum);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+ Status s = GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data);
+ if (verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ change_checksum_called = false;
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) {
+ SyncPoint::GetInstance()->DisableProcessing();
+ int file_id = 0;
+ EnvOptions env_options;
+ do {
+ Options options = CurrentOptions();
+ std::string file_path = sst_files_dir_ + ToString(file_id++);
+ SstFileWriter sst_file_writer(env_options, options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + ToString(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ {
+ // Get file size
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+ ASSERT_GT(file_size, 8);
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file
+ // We deterministically corrupt the first byte because we currently
+ // cannot choose a random offset. The reason for this limitation is that
+      // we do not checksum the property block at present.
+ const uint64_t offset = 0;
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip one bit
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+ // Ingest file.
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ s = db_->IngestExternalFile({file_path}, ifo);
+ if (ifo.verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) {
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ if (!verify_checksums_before_ingest) {
+ return;
+ }
+ uint64_t props_block_offset = 0;
+ size_t props_block_size = 0;
+ const auto& get_props_block_offset = [&](void* arg) {
+ props_block_offset = *reinterpret_cast<uint64_t*>(arg);
+ };
+ const auto& get_props_block_size = [&](void* arg) {
+ props_block_size = *reinterpret_cast<uint64_t*>(arg);
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+ get_props_block_offset);
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+ get_props_block_size);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ Random64 rand(time(nullptr));
+ do {
+ std::string file_path = sst_files_dir_ + ToString(file_id++);
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + ToString(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ {
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file
+ ASSERT_GT(props_block_size, 8);
+ uint64_t offset =
+ props_block_offset + rand.Next() % (props_block_size - 8);
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip one bit
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+
+ // Ingest file.
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = true;
+ s = db_->IngestExternalFile({file_path}, ifo);
+ ASSERT_NOK(s);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
+ Options options = CurrentOptions();
+
+ std::vector<std::string> files;
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ ASSERT_OK(sst_file_writer.Put("a", "z"));
+ ASSERT_OK(sst_file_writer.Put("i", "m"));
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+ files.push_back(std::move(file1));
+ }
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ ASSERT_OK(sst_file_writer.Put("i", "k"));
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ files.push_back(std::move(file2));
+ }
+
+ IngestExternalFileOptions ifo;
+ ASSERT_OK(db_->IngestExternalFile(files, ifo));
+ ASSERT_EQ(Get("a"), "z");
+ ASSERT_EQ(Get("i"), "k");
+
+ int total_keys = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ total_keys++;
+ }
+ delete iter;
+ ASSERT_EQ(total_keys, 2);
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+}
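+
+// For illustration: because the two ingested files overlap on "i", they
+// cannot be placed in the same sorted run, so both land in L0 and the later
+// file in the ingestion list ("i" -> "k") shadows the earlier one
+// ("i" -> "m"), which is what the Get() assertions above check.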
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
+ testing::Values(std::make_tuple(true, true),
+ std::make_tuple(true, false),
+ std::make_tuple(false, true),
+ std::make_tuple(false, false)));
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.cc b/src/rocksdb/db/external_sst_file_ingestion_job.cc
new file mode 100644
index 000000000..4cec5d376
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.cc
@@ -0,0 +1,731 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/external_sst_file_ingestion_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ExternalSstFileIngestionJob::Prepare(
+ const std::vector<std::string>& external_files_paths,
+ uint64_t next_file_number, SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are ingesting
+ for (const std::string& file_path : external_files_paths) {
+ IngestedFileInfo file_to_ingest;
+ status = GetIngestedFileInfo(file_path, &file_to_ingest, sv);
+ if (!status.ok()) {
+ return status;
+ }
+ files_to_ingest_.push_back(file_to_ingest);
+ }
+
+ for (const IngestedFileInfo& f : files_to_ingest_) {
+ if (f.cf_id !=
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
+ f.cf_id != cfd_->GetID()) {
+ return Status::InvalidArgument(
+ "External file column family id dont match");
+ }
+ }
+
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ auto num_files = files_to_ingest_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+ // Verify that the passed files don't have overlapping ranges
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ sorted_files.push_back(&files_to_ingest_[i]);
+ }
+
+ std::sort(
+ sorted_files.begin(), sorted_files.end(),
+ [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+ return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i < num_files - 1; i++) {
+ if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >= 0) {
+ files_overlap_ = true;
+ break;
+ }
+ }
+ }
+
+ if (ingestion_options_.ingest_behind && files_overlap_) {
+ return Status::NotSupported("Files have overlapping ranges");
+ }
+
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.num_entries == 0 && f.num_range_deletions == 0) {
+ return Status::InvalidArgument("File contain no entries");
+ }
+
+ if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+ return Status::Corruption("Generated table have corrupted keys");
+ }
+ }
+
+ // Copy/Move external files into DB
+ std::unordered_set<size_t> ingestion_path_ids;
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
+ f.copy_file = false;
+ const std::string path_outside_db = f.external_file_path;
+ const std::string path_inside_db =
+ TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(),
+ f.fd.GetPathId());
+ if (ingestion_options_.move_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.ok()) {
+ // It is unsafe to assume the application has synced the file and its
+ // directory before ingesting it. For the integrity of RocksDB we need
+ // to sync the file.
+ std::unique_ptr<FSWritableFile> file_to_sync;
+ status = fs_->ReopenWritableFile(path_inside_db, env_options_,
+ &file_to_sync, nullptr);
+ if (status.ok()) {
+ TEST_SYNC_POINT(
+ "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+ status = SyncIngestedFile(file_to_sync.get());
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s: %s",
+ path_inside_db.c_str(), status.ToString().c_str());
+ }
+ }
+ } else if (status.IsNotSupported() &&
+ ingestion_options_.failed_move_fall_back_to_copy) {
+ // Original file is on a different FS, use copy instead of hard linking.
+ f.copy_file = true;
+ }
+ } else {
+ f.copy_file = true;
+ }
+
+ if (f.copy_file) {
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+ nullptr);
+ // CopyFile also syncs the new file.
+ status = CopyFile(fs_, path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync);
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
+ if (!status.ok()) {
+ break;
+ }
+ f.internal_file_path = path_inside_db;
+ ingestion_path_ids.insert(f.fd.GetPathId());
+ }
+
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
+ if (status.ok()) {
+ for (auto path_id : ingestion_path_ids) {
+ status = directories_->GetDataDir(path_id)->Fsync();
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync directory %" ROCKSDB_PRIszt
+ " while ingest file: %s",
+ path_id, status.ToString().c_str());
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
+
+ // TODO: The following is duplicated with Cleanup().
+ if (!status.ok()) {
+ // We failed; remove all files that we copied into the DB
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.internal_file_path.empty()) {
+ continue;
+ }
+ Status s = env_->DeleteFile(f.internal_file_path);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
+ SuperVersion* super_version) {
+ autovector<Range> ranges;
+ for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+ ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
+ file_to_ingest.largest_internal_key.user_key());
+ }
+ Status status =
+ cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed);
+ if (status.ok() && *flush_needed &&
+ !ingestion_options_.allow_blocking_flush) {
+ status = Status::InvalidArgument("External file requires flush");
+ }
+ return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ExternalSstFileIngestionJob::Run() {
+ Status status;
+ SuperVersion* super_version = cfd_->GetSuperVersion();
+#ifndef NDEBUG
+ // We should never run the job with a memtable that is overlapping
+ // with the files we are ingesting
+ bool need_flush = false;
+ status = NeedsFlush(&need_flush, super_version);
+ assert(status.ok() && need_flush == false);
+#endif
+
+ bool force_global_seqno = false;
+
+ if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
+ // We need to assign a global sequence number to all the files even
+ // if they don't overlap with any ranges, since we have snapshots
+ force_global_seqno = true;
+ }
+ // It is safe to use this instead of LastAllocatedSequence since we are
+ // the only active writer, and hence they are equal
+ SequenceNumber last_seqno = versions_->LastSequence();
+ edit_.SetColumnFamily(cfd_->GetID());
+ // The levels that the files will be ingested into
+
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ SequenceNumber assigned_seqno = 0;
+ if (ingestion_options_.ingest_behind) {
+ status = CheckLevelForIngestedBehindFile(&f);
+ } else {
+ status = AssignLevelAndSeqnoForIngestedFile(
+ super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
+ last_seqno, &f, &assigned_seqno);
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
+ &assigned_seqno);
+ if (assigned_seqno > last_seqno) {
+ assert(assigned_seqno == last_seqno + 1);
+ last_seqno = assigned_seqno;
+ ++consumed_seqno_count_;
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ // We use the import time as the ancestor time. This is the time the data
+ // is written to the database.
+ int64_t temp_current_time = 0;
+ uint64_t current_time = kUnknownFileCreationTime;
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+ if (env_->GetCurrentTime(&temp_current_time).ok()) {
+ current_time = oldest_ancester_time =
+ static_cast<uint64_t>(temp_current_time);
+ }
+
+ edit_.AddFile(
+ f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
+ f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
+ f.assigned_seqno, false, kInvalidBlobFileNumber, oldest_ancester_time,
+ current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ }
+ return status;
+}
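+ // Note on sequence numbers: Run() consumes at most one new sequence number
+ // per ingested file (and none when a file can keep an existing seqno);
+ // consumed_seqno_count_ records how many were actually allocated so the
+ // caller can advance the published last sequence accordingly.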
+
+void ExternalSstFileIngestionJob::UpdateStats() {
+ // Update internal stats for new ingested files
+ uint64_t total_keys = 0;
+ uint64_t total_l0_files = 0;
+ uint64_t total_time = env_->NowMicros() - job_start_time_;
+
+ EventLoggerStream stream = event_logger_->Log();
+ stream << "event"
+ << "ingest_finished";
+ stream << "files_ingested";
+ stream.StartArray();
+
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ InternalStats::CompactionStats stats(CompactionReason::kExternalSstIngestion, 1);
+ stats.micros = total_time;
+ // If actual copy occurred for this file, then we need to count the file
+ // size as the actual bytes written. If the file was linked, then we ignore
+ // the bytes written for file metadata.
+ // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
+ if (f.copy_file) {
+ stats.bytes_written = f.fd.GetFileSize();
+ } else {
+ stats.bytes_moved = f.fd.GetFileSize();
+ }
+ stats.num_output_files = 1;
+ cfd_->internal_stats()->AddCompactionStats(f.picked_level,
+ Env::Priority::USER, stats);
+ cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
+ f.fd.GetFileSize());
+ total_keys += f.num_entries;
+ if (f.picked_level == 0) {
+ total_l0_files += 1;
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[AddFile] External SST file %s was ingested in L%d with path %s "
+ "(global_seqno=%" PRIu64 ")\n",
+ f.external_file_path.c_str(), f.picked_level,
+ f.internal_file_path.c_str(), f.assigned_seqno);
+ stream << "file" << f.internal_file_path << "level" << f.picked_level;
+ }
+ stream.EndArray();
+
+ stream << "lsm_state";
+ stream.StartArray();
+ auto vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
+ total_keys);
+ cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
+ files_to_ingest_.size());
+ cfd_->internal_stats()->AddCFStats(
+ InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
+}
+
+void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
+ if (!status.ok()) {
+ // We failed to add the files to the database;
+ // remove all the files we copied
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.internal_file_path.empty()) {
+ continue;
+ }
+ Status s = env_->DeleteFile(f.internal_file_path);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ consumed_seqno_count_ = 0;
+ files_overlap_ = false;
+ } else if (status.ok() && ingestion_options_.move_files) {
+ // The files were moved and added successfully; remove the original file links
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ Status s = env_->DeleteFile(f.external_file_path);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "%s was added to DB successfully but failed to remove original "
+ "file link : %s",
+ f.external_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+}
+
+Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
+ const std::string& external_file, IngestedFileInfo* file_to_ingest,
+ SuperVersion* sv) {
+ file_to_ingest->external_file_path = external_file;
+
+ // Get external file size
+ Status status = fs_->GetFileSize(external_file, IOOptions(),
+ &file_to_ingest->file_size, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create TableReader for external file
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<FSRandomAccessFile> sst_file;
+ std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+ status = fs_->NewRandomAccessFile(external_file, env_options_,
+ &sst_file, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+ sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file),
+ external_file));
+
+ status = cfd_->ioptions()->table_factory->NewTableReader(
+ TableReaderOptions(*cfd_->ioptions(),
+ sv->mutable_cf_options.prefix_extractor.get(),
+ env_options_, cfd_->internal_comparator()),
+ std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
+ if (!status.ok()) {
+ return status;
+ }
+
+ if (ingestion_options_.verify_checksums_before_ingest) {
+ // If customized readahead size is needed, we can pass a user option
+ // all the way to here. Right now we just rely on the default readahead
+ // to keep things simple.
+ ReadOptions ro;
+ ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
+ status = table_reader->VerifyChecksum(
+ ro, TableReaderCaller::kExternalSSTIngestion);
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Get the external file properties
+ auto props = table_reader->GetTableProperties();
+ const auto& uprops = props->user_collected_properties;
+
+ // Get table version
+ auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
+ if (version_iter == uprops.end()) {
+ return Status::Corruption("External file version not found");
+ }
+ file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
+
+ auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+ if (file_to_ingest->version == 2) {
+ // version 2 implies that the file has a global sequence number
+ if (seqno_iter == uprops.end()) {
+ return Status::Corruption(
+ "External file global sequence number not found");
+ }
+
+ // Set the global sequence number
+ file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
+ auto offsets_iter = props->properties_offsets.find(
+ ExternalSstFilePropertyNames::kGlobalSeqno);
+ if (offsets_iter == props->properties_offsets.end() ||
+ offsets_iter->second == 0) {
+ file_to_ingest->global_seqno_offset = 0;
+ return Status::Corruption("Was not able to find file global seqno field");
+ }
+ file_to_ingest->global_seqno_offset = static_cast<size_t>(offsets_iter->second);
+ } else if (file_to_ingest->version == 1) {
+ // SST file V1 should not have global seqno field
+ assert(seqno_iter == uprops.end());
+ file_to_ingest->original_seqno = 0;
+ if (ingestion_options_.allow_blocking_flush ||
+ ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument(
+ "External SST file V1 does not support global seqno");
+ }
+ } else {
+ return Status::InvalidArgument("External file version is not supported");
+ }
+ // Get number of entries in table
+ file_to_ingest->num_entries = props->num_entries;
+ file_to_ingest->num_range_deletions = props->num_range_deletions;
+
+ ParsedInternalKey key;
+ ReadOptions ro;
+ // While reading the external file we may fill the block cache with the
+ // blocks we read; if we later change the global seqno of this file, those
+ // cached blocks would contain keys with the wrong seqno.
+ // We need to disable fill_cache so that we read from the file without
+ // updating the block cache.
+ ro.fill_cache = false;
+ std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+ ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+ std::unique_ptr<InternalIterator> range_del_iter(
+ table_reader->NewRangeTombstoneIterator(ro));
+
+ // Get first (smallest) and last (largest) key from file.
+ file_to_ingest->smallest_internal_key =
+ InternalKey("", 0, ValueType::kTypeValue);
+ file_to_ingest->largest_internal_key =
+ InternalKey("", 0, ValueType::kTypeValue);
+ bool bounds_set = false;
+ iter->SeekToFirst();
+ if (iter->Valid()) {
+ if (!ParseInternalKey(iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ if (key.sequence != 0) {
+ return Status::Corruption("external file have non zero sequence number");
+ }
+ file_to_ingest->smallest_internal_key.SetFrom(key);
+
+ iter->SeekToLast();
+ if (!ParseInternalKey(iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ if (key.sequence != 0) {
+ return Status::Corruption("external file have non zero sequence number");
+ }
+ file_to_ingest->largest_internal_key.SetFrom(key);
+
+ bounds_set = true;
+ }
+
+ // We may need to adjust these key bounds, depending on whether any range
+ // deletion tombstones extend past them.
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ if (range_del_iter != nullptr) {
+ for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+ range_del_iter->Next()) {
+ if (!ParseInternalKey(range_del_iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ RangeTombstone tombstone(key, range_del_iter->value());
+
+ InternalKey start_key = tombstone.SerializeKey();
+ if (!bounds_set ||
+ sstableKeyCompare(ucmp, start_key,
+ file_to_ingest->smallest_internal_key) < 0) {
+ file_to_ingest->smallest_internal_key = start_key;
+ }
+ InternalKey end_key = tombstone.SerializeEndKey();
+ if (!bounds_set ||
+ sstableKeyCompare(ucmp, end_key,
+ file_to_ingest->largest_internal_key) > 0) {
+ file_to_ingest->largest_internal_key = end_key;
+ }
+ bounds_set = true;
+ }
+ }
+
+ file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+ file_to_ingest->table_properties = *props;
+
+ return status;
+}
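+ // For reference: files produced by SstFileWriter carry an
+ // ExternalSstFilePropertyNames::kVersion user property, and version 2 files
+ // additionally reserve a fixed-width kGlobalSeqno property whose offset is
+ // captured above so AssignGlobalSeqnoForIngestedFile() can later patch the
+ // value in place.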
+
+Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
+ SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
+ SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
+ SequenceNumber* assigned_seqno) {
+ Status status;
+ *assigned_seqno = 0;
+ if (force_global_seqno) {
+ *assigned_seqno = last_seqno + 1;
+ if (compaction_style == kCompactionStyleUniversal || files_overlap_) {
+ file_to_ingest->picked_level = 0;
+ return status;
+ }
+ }
+
+ bool overlap_with_db = false;
+ Arena arena;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ int target_level = 0;
+ auto* vstorage = cfd_->current()->storage_info();
+
+ for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+ if (lvl > 0 && lvl < vstorage->base_level()) {
+ continue;
+ }
+
+ if (vstorage->NumLevelFiles(lvl) > 0) {
+ bool overlap_with_level = false;
+ status = sv->current->OverlapWithLevelIterator(
+ ro, env_options_, file_to_ingest->smallest_internal_key.user_key(),
+ file_to_ingest->largest_internal_key.user_key(), lvl,
+ &overlap_with_level);
+ if (!status.ok()) {
+ return status;
+ }
+ if (overlap_with_level) {
+ // We must use L0 or a level above `lvl` to be able to overwrite the keys
+ // that we overlap with in this level. We also need to assign this file a
+ // seqno so it overwrites the existing keys in level `lvl`
+ overlap_with_db = true;
+ break;
+ }
+
+ if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
+ const std::vector<FileMetaData*>& level_files =
+ vstorage->LevelFiles(lvl);
+ const SequenceNumber level_largest_seqno =
+ (*max_element(level_files.begin(), level_files.end(),
+ [](FileMetaData* f1, FileMetaData* f2) {
+ return f1->fd.largest_seqno < f2->fd.largest_seqno;
+ }))
+ ->fd.largest_seqno;
+ // Only assign the current level's largest seqno to this file when the
+ // file fits in the level
+ if (level_largest_seqno != 0 &&
+ IngestedFileFitInLevel(file_to_ingest, lvl)) {
+ *assigned_seqno = level_largest_seqno;
+ } else {
+ continue;
+ }
+ }
+ } else if (compaction_style == kCompactionStyleUniversal) {
+ continue;
+ }
+
+ // We don't overlap with any keys in this level, but we still need to
+ // check whether our file can fit in it
+ if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
+ target_level = lvl;
+ }
+ }
+ // If files overlap, we have to ingest them at level 0 and assign the newest
+ // sequence number
+ if (files_overlap_) {
+ target_level = 0;
+ *assigned_seqno = last_seqno + 1;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ &overlap_with_db);
+ file_to_ingest->picked_level = target_level;
+ if (overlap_with_db && *assigned_seqno == 0) {
+ *assigned_seqno = last_seqno + 1;
+ }
+ return status;
+}
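+ // Rough example with leveled compaction: if the ingested file fits cleanly
+ // in L1 and L2 but overlaps existing keys in L3, the loop above breaks at
+ // L3 with overlap_with_db=true, target_level stays at 2, and the file is
+ // assigned last_seqno + 1 so its keys shadow the overlapped ones below.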
+
+Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
+ IngestedFileInfo* file_to_ingest) {
+ auto* vstorage = cfd_->current()->storage_info();
+ // First check whether the new file fits in the bottommost level
+ int bottom_lvl = cfd_->NumberLevels() - 1;
+ if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as it doesn't fit "
+ "at the bottommost level!");
+ }
+
+ // Second, check whether, despite allow_ingest_behind=true, we still have
+ // files with seqno 0 at some upper level
+ for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
+ for (auto file : vstorage->LevelFiles(lvl)) {
+ if (file->fd.smallest_seqno == 0) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as despite allow_ingest_behind=true "
+ "there are files with 0 seqno in database at upper levels!");
+ }
+ }
+ }
+
+ file_to_ingest->picked_level = bottom_lvl;
+ return Status::OK();
+}
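+ // This path assumes the DB runs with allow_ingest_behind=true, which is
+ // expected to keep the bottommost level reserved for ingested data and to
+ // avoid zeroing out sequence numbers in the upper levels; the loop above is
+ // a defensive check that no upper level already holds seqno-0 files, since
+ // the ingested data (seqno 0) must logically sit below everything else.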
+
+Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
+ IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+ if (file_to_ingest->original_seqno == seqno) {
+ // This file already has the correct global seqno
+ return Status::OK();
+ } else if (!ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument("Global seqno is required, but disabled");
+ } else if (file_to_ingest->global_seqno_offset == 0) {
+ return Status::InvalidArgument(
+ "Trying to set global seqno for a file that dont have a global seqno "
+ "field");
+ }
+
+ if (ingestion_options_.write_global_seqno) {
+ // Determine whether we can write global_seqno at the given offset of the
+ // file. If the file system does not support random writes, we should not;
+ // otherwise we should.
+ std::unique_ptr<FSRandomRWFile> rwfile;
+ Status status =
+ fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_,
+ &rwfile, nullptr);
+ if (status.ok()) {
+ std::string seqno_val;
+ PutFixed64(&seqno_val, seqno);
+ status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val,
+ IOOptions(), nullptr);
+ if (status.ok()) {
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
+ status = SyncIngestedFile(rwfile.get());
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s after writing global "
+ "sequence number: %s",
+ file_to_ingest->internal_file_path.c_str(),
+ status.ToString().c_str());
+ }
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ } else if (!status.IsNotSupported()) {
+ return status;
+ }
+ }
+
+ file_to_ingest->assigned_seqno = seqno;
+ return Status::OK();
+}
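+ // When write_global_seqno is false the file on disk is left untouched; the
+ // assigned sequence number still takes effect because Run() records it in
+ // the VersionEdit, and it is used as the file's global seqno when the table
+ // is read.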
+
+bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
+ const IngestedFileInfo* file_to_ingest, int level) {
+ if (level == 0) {
+ // Files can always fit in L0
+ return true;
+ }
+
+ auto* vstorage = cfd_->current()->storage_info();
+ Slice file_smallest_user_key(
+ file_to_ingest->smallest_internal_key.user_key());
+ Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
+
+ if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
+ &file_largest_user_key)) {
+ // The file overlaps with other files in this level; we cannot
+ // add it to this level
+ return false;
+ }
+ if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
+ file_largest_user_key, level)) {
+ // The file overlaps with the output of a running compaction that will be
+ // stored in this level; we cannot add this file to this level
+ return false;
+ }
+
+ // The file does not overlap with the level's files or with any running
+ // compaction output for this level
+ return true;
+}
+
+template <typename TWritableFile>
+Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
+ assert(file != nullptr);
+ if (db_options_.use_fsync) {
+ return file->Fsync(IOOptions(), nullptr);
+ } else {
+ return file->Sync(IOOptions(), nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.h b/src/rocksdb/db/external_sst_file_ingestion_job.h
new file mode 100644
index 000000000..7ddb6f3e8
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.h
@@ -0,0 +1,180 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/snapshot_impl.h"
+#include "logging/event_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Directories;
+
+struct IngestedFileInfo {
+ // External file path
+ std::string external_file_path;
+ // Smallest internal key in external file
+ InternalKey smallest_internal_key;
+ // Largest internal key in external file
+ InternalKey largest_internal_key;
+ // Sequence number for keys in external file
+ SequenceNumber original_seqno;
+ // Offset of the global sequence number field in the file, will
+ // be zero if version is 1 (global seqno is not supported)
+ size_t global_seqno_offset;
+ // External file size
+ uint64_t file_size;
+ // total number of keys in external file
+ uint64_t num_entries;
+ // total number of range deletions in external file
+ uint64_t num_range_deletions;
+ // Id of column family this file should be ingested into
+ uint32_t cf_id;
+ // TableProperties read from external file
+ TableProperties table_properties;
+ // Version of external file
+ int version;
+
+ // FileDescriptor for the file inside the DB
+ FileDescriptor fd;
+ // file path that we picked for file inside the DB
+ std::string internal_file_path;
+ // Global sequence number that we picked for the file inside the DB
+ SequenceNumber assigned_seqno = 0;
+ // Level inside the DB we picked for the external file.
+ int picked_level = 0;
+ // Whether to copy or link the external sst file. copy_file will be set to
+ // false if ingestion_options.move_files is true and the underlying FS
+ // supports the link operation. A default value is needed to keep llvm's
+ // undefined-behavior sanity check happy; since ingestion_options.move_files
+ // is false by default, copy_file defaults to true.
+ bool copy_file = true;
+};
+
+class ExternalSstFileIngestionJob {
+ public:
+ ExternalSstFileIngestionJob(
+ Env* env, VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+ SnapshotList* db_snapshots,
+ const IngestExternalFileOptions& ingestion_options,
+ Directories* directories, EventLogger* event_logger)
+ : env_(env),
+ fs_(db_options.fs.get()),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ env_options_(env_options),
+ db_snapshots_(db_snapshots),
+ ingestion_options_(ingestion_options),
+ directories_(directories),
+ event_logger_(event_logger),
+ job_start_time_(env_->NowMicros()),
+ consumed_seqno_count_(0) {
+ assert(directories != nullptr);
+ }
+
+ // Prepare the job by copying external files into the DB.
+ Status Prepare(const std::vector<std::string>& external_files_paths,
+ uint64_t next_file_number, SuperVersion* sv);
+
+ // Check whether we need to flush the memtable before running the ingestion
+ // job. This will be true if the files we are ingesting overlap with any
+ // key range in the memtable.
+ //
+ // @param super_version A referenced SuperVersion that will be held for the
+ // duration of this function.
+ //
+ // Thread-safe
+ Status NeedsFlush(bool* flush_needed, SuperVersion* super_version);
+
+ // Will execute the ingestion job and prepare edit() to be applied.
+ // REQUIRES: Mutex held
+ Status Run();
+
+ // Update column family stats.
+ // REQUIRES: Mutex held
+ void UpdateStats();
+
+ // Cleanup after successful/failed job
+ void Cleanup(const Status& status);
+
+ VersionEdit* edit() { return &edit_; }
+
+ const autovector<IngestedFileInfo>& files_to_ingest() const {
+ return files_to_ingest_;
+ }
+
+ // How many sequence numbers did we consume as part of the ingest job?
+ int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+
+ private:
+ // Open the external file and populate `file_to_ingest` with all the
+ // external information we need to ingest this file.
+ Status GetIngestedFileInfo(const std::string& external_file,
+ IngestedFileInfo* file_to_ingest,
+ SuperVersion* sv);
+
+ // Assign `file_to_ingest` the appropriate sequence number and the lowest
+ // possible level that it can be ingested into according to compaction_style.
+ // REQUIRES: Mutex held
+ Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv,
+ bool force_global_seqno,
+ CompactionStyle compaction_style,
+ SequenceNumber last_seqno,
+ IngestedFileInfo* file_to_ingest,
+ SequenceNumber* assigned_seqno);
+
+ // File that we want to ingest behind always goes to the lowest level;
+ // we just check that it fits in the level, that DB allows ingest_behind,
+ // and that we don't have 0 seqnums at the upper levels.
+ // REQUIRES: Mutex held
+ Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
+
+ // Set the file global sequence number to `seqno`
+ Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
+ SequenceNumber seqno);
+
+ // Check if `file_to_ingest` can fit in level `level`
+ // REQUIRES: Mutex held
+ bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
+ int level);
+
+ // Helper method to sync given file.
+ template <typename TWritableFile>
+ Status SyncIngestedFile(TWritableFile* file);
+
+ Env* env_;
+ FileSystem* fs_;
+ VersionSet* versions_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const EnvOptions& env_options_;
+ SnapshotList* db_snapshots_;
+ autovector<IngestedFileInfo> files_to_ingest_;
+ const IngestExternalFileOptions& ingestion_options_;
+ Directories* directories_;
+ EventLogger* event_logger_;
+ VersionEdit edit_;
+ uint64_t job_start_time_;
+ int consumed_seqno_count_;
+ // Set in ExternalSstFileIngestionJob::Prepare(); if true, all files are
+ // ingested into L0
+ bool files_overlap_{false};
+};
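+ // Rough lifecycle sketch (the real driver lives in DBImpl; the variable
+ // names below are only illustrative):
+ //   ExternalSstFileIngestionJob job(env, versions, cfd, db_options,
+ //                                   env_options, snapshots, ifo, dirs, log);
+ //   job.Prepare(files, next_file_number, sv);   // copy/link files into DB
+ //   bool need_flush = false;
+ //   job.NeedsFlush(&need_flush, sv);            // flush memtable if needed
+ //   job.Run();                                  // pick levels and seqnos
+ //   // apply job.edit() via VersionSet::LogAndApply(), then:
+ //   job.UpdateStats();
+ //   job.Cleanup(final_status);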
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_test.cc b/src/rocksdb/db/external_sst_file_test.cc
new file mode 100644
index 000000000..0b91910a1
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_test.cc
@@ -0,0 +1,2832 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include "db/db_test_util.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A test environment that can be configured to fail the Link operation.
+class ExternalSSTTestEnv : public EnvWrapper {
+ public:
+ ExternalSSTTestEnv(Env* t, bool fail_link)
+ : EnvWrapper(t), fail_link_(fail_link) {}
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ if (fail_link_) {
+ return Status::NotSupported("Link failed");
+ }
+ return target()->LinkFile(s, t);
+ }
+
+ void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
+
+ private:
+ bool fail_link_;
+};
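+ // Failing LinkFile() lets the fallback fixture below simulate a filesystem
+ // without hard-link support, which exercises the
+ // failed_move_fall_back_to_copy path in the ingestion job.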
+
+class ExternSSTFileLinkFailFallbackTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternSSTFileLinkFailFallbackTest()
+ : DBTestBase("/external_sst_file_test"),
+ test_env_(new ExternalSSTTestEnv(env_, true)) {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ test::DestroyDir(env_, sst_files_dir_);
+ env_->CreateDir(sst_files_dir_);
+ options_ = CurrentOptions();
+ options_.disable_auto_compactions = true;
+ options_.env = test_env_;
+ }
+
+ void TearDown() override {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options_));
+ delete test_env_;
+ test_env_ = nullptr;
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ Options options_;
+ ExternalSSTTestEnv* test_env_;
+};
+
+class ExternalSSTFileTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternalSSTFileTest() : DBTestBase("/external_sst_file_test") {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ DestroyAndRecreateExternalSSTFilesDir();
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ test::DestroyDir(env_, sst_files_dir_);
+ env_->CreateDir(sst_files_dir_);
+ }
+
+ Status GenerateOneExternalFile(
+ const Options& options, ColumnFamilyHandle* cfh,
+ std::vector<std::pair<std::string, std::string>>& data, int file_id,
+ bool sort_data, std::string* external_file_path,
+ std::map<std::string, std::string>* true_data) {
+ // Generate a file id if not provided
+ if (-1 == file_id) {
+ file_id = (++last_file_id_);
+ }
+ // Sort data if asked to do so
+ if (sort_data) {
+ std::sort(data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) < 0;
+ });
+ auto uniq_iter = std::unique(
+ data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) == 0;
+ });
+ data.resize(uniq_iter - data.begin());
+ }
+ std::string file_path = sst_files_dir_ + ToString(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (const auto& entry : data) {
+ s = sst_file_writer.Put(entry.first, entry.second);
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+ if (s.ok() && external_file_path != nullptr) {
+ *external_file_path = file_path;
+ }
+ if (s.ok() && nullptr != true_data) {
+ for (const auto& entry : data) {
+ true_data->insert({entry.first, entry.second});
+ }
+ }
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options,
+ std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
+ bool allow_global_seqno = false, bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ // Generate a file id if not provided
+ if (file_id == -1) {
+ file_id = last_file_id_ + 1;
+ last_file_id_++;
+ }
+
+ // Sort data if asked to do so
+ if (sort_data) {
+ std::sort(data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) < 0;
+ });
+ auto uniq_iter = std::unique(
+ data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) == 0;
+ });
+ data.resize(uniq_iter - data.begin());
+ }
+ std::string file_path = sst_files_dir_ + ToString(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& entry : data) {
+ s = sst_file_writer.Put(entry.first, entry.second);
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+
+ if (s.ok()) {
+ IngestExternalFileOptions ifo;
+ ifo.allow_global_seqno = allow_global_seqno;
+ ifo.write_global_seqno = allow_global_seqno ? write_global_seqno : false;
+ ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+ ifo.ingest_behind = ingest_behind;
+ if (cfh) {
+ s = db_->IngestExternalFile(cfh, {file_path}, ifo);
+ } else {
+ s = db_->IngestExternalFile({file_path}, ifo);
+ }
+ }
+
+ if (s.ok() && true_data) {
+ for (auto& entry : data) {
+ (*true_data)[entry.first] = entry.second;
+ }
+ }
+
+ return s;
+ }
+
+ Status GenerateAndAddExternalFiles(
+ const Options& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ const std::vector<IngestExternalFileOptions>& ifos,
+ std::vector<std::vector<std::pair<std::string, std::string>>>& data,
+ int file_id, bool sort_data,
+ std::vector<std::map<std::string, std::string>>& true_data) {
+ if (-1 == file_id) {
+ file_id = (++last_file_id_);
+ }
+ // Generate external SST files, one for each column family
+ size_t num_cfs = column_families.size();
+ assert(ifos.size() == num_cfs);
+ assert(data.size() == num_cfs);
+ Status s;
+ std::vector<IngestExternalFileArg> args(num_cfs);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ std::string external_file_path;
+ s = GenerateOneExternalFile(
+ options, column_families[i], data[i], file_id, sort_data,
+ &external_file_path,
+ true_data.size() == num_cfs ? &true_data[i] : nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ ++file_id;
+
+ args[i].column_family = column_families[i];
+ args[i].external_files.push_back(external_file_path);
+ args[i].options = ifos[i];
+ }
+ s = db_->IngestExternalFiles(args);
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<std::pair<int, std::string>> data,
+ int file_id = -1, bool allow_global_seqno = false,
+ bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ std::vector<std::pair<std::string, std::string>> file_data;
+ for (auto& entry : data) {
+ file_data.emplace_back(Key(entry.first), entry.second);
+ }
+ return GenerateAndAddExternalFile(options, file_data, file_id,
+ allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest,
+ ingest_behind, sort_data, true_data, cfh);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys, int file_id = -1,
+ bool allow_global_seqno = false, bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ std::vector<std::pair<std::string, std::string>> file_data;
+ for (auto& k : keys) {
+ file_data.emplace_back(Key(k), Key(k) + ToString(file_id));
+ }
+ return GenerateAndAddExternalFile(options, file_data, file_id,
+ allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest,
+ ingest_behind, sort_data, true_data, cfh);
+ }
+
+ Status DeprecatedAddFile(const std::vector<std::string>& files,
+ bool move_files = false,
+ bool skip_snapshot_check = false,
+ bool skip_write_global_seqno = false) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ opts.write_global_seqno = !skip_write_global_seqno;
+ return db_->IngestExternalFile(files, opts);
+ }
+
+ ~ExternalSSTFileTest() override { test::DestroyDir(env_, sst_files_dir_); }
+
+ protected:
+ int last_file_id_ = 0;
+ std::string sst_files_dir_;
+};
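+ // The GenerateAndAddExternalFile() overloads above differ only in how the
+ // data is specified (string key/value pairs, (int, value) pairs, or bare
+ // int keys); the latter two build string pairs with Key() and forward to
+ // the first overload.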
+
+TEST_F(ExternalSSTFileTest, Basic) {
+ do {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer init and before opening a file.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+ // sst_file_writer has already finished; cannot add this value
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // file2.sst (100 => 199)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ // Cannot add this key because it's not after last added key
+ s = sst_file_writer.Put(Key(99), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 100);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(199));
+
+ // file3.sst (195 => 299)
+ // This file's values overlap with file2's values
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 195; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ // Current file size should be non-zero after a successful Finish().
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 105);
+ ASSERT_EQ(file3_info.smallest_key, Key(195));
+ ASSERT_EQ(file3_info.largest_key, Key(299));
+
+ // file4.sst (30 => 39)
+ // This file's values overlap with file1's values
+ std::string file4 = sst_files_dir_ + "file4.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 30; k < 40; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ s = sst_file_writer.Finish(&file4_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 10);
+ ASSERT_EQ(file4_info.smallest_key, Key(30));
+ ASSERT_EQ(file4_info.largest_key, Key(39));
+
+ // file5.sst (400 => 499)
+ std::string file5 = sst_files_dir_ + "file5.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 400; k < 500; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file5_info;
+ s = sst_file_writer.Finish(&file5_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 100);
+ ASSERT_EQ(file5_info.smallest_key, Key(400));
+ ASSERT_EQ(file5_info.largest_key, Key(499));
+
+ // file6.sst (delete 400 => 500)
+ std::string file6 = sst_files_dir_ + "file6.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ sst_file_writer.DeleteRange(Key(400), Key(500));
+ ExternalSstFileInfo file6_info;
+ s = sst_file_writer.Finish(&file6_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 0);
+ ASSERT_EQ(file6_info.smallest_key, "");
+ ASSERT_EQ(file6_info.largest_key, "");
+ ASSERT_EQ(file6_info.num_range_del_entries, 1);
+ ASSERT_EQ(file6_info.smallest_range_del_key, Key(400));
+ ASSERT_EQ(file6_info.largest_range_del_key, Key(500));
+
+ // file7.sst (delete 500 => 575, put even keys 520 => 598)
+ std::string file7 = sst_files_dir_ + "file7.sst";
+ ASSERT_OK(sst_file_writer.Open(file7));
+ sst_file_writer.DeleteRange(Key(500), Key(550));
+ for (int k = 520; k < 560; k += 2) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ sst_file_writer.DeleteRange(Key(525), Key(575));
+ for (int k = 560; k < 600; k += 2) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file7_info;
+ s = sst_file_writer.Finish(&file7_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file7_info.file_path, file7);
+ ASSERT_EQ(file7_info.num_entries, 40);
+ ASSERT_EQ(file7_info.smallest_key, Key(520));
+ ASSERT_EQ(file7_info.largest_key, Key(598));
+ ASSERT_EQ(file7_info.num_range_del_entries, 2);
+ ASSERT_EQ(file7_info.smallest_range_del_key, Key(500));
+ ASSERT_EQ(file7_info.largest_range_del_key, Key(575));
+
+ // file8.sst (delete 600 => 700)
+ std::string file8 = sst_files_dir_ + "file8.sst";
+ ASSERT_OK(sst_file_writer.Open(file8));
+ sst_file_writer.DeleteRange(Key(600), Key(700));
+ ExternalSstFileInfo file8_info;
+ s = sst_file_writer.Finish(&file8_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file8_info.file_path, file8);
+ ASSERT_EQ(file8_info.num_entries, 0);
+ ASSERT_EQ(file8_info.smallest_key, "");
+ ASSERT_EQ(file8_info.largest_key, "");
+ ASSERT_EQ(file8_info.num_range_del_entries, 1);
+ ASSERT_EQ(file8_info.smallest_range_del_key, Key(600));
+ ASSERT_EQ(file8_info.largest_range_del_key, Key(700));
+
+ // Cannot create an empty sst file
+ std::string file_empty = sst_files_dir_ + "file_empty.sst";
+ ExternalSstFileInfo file_empty_info;
+ s = sst_file_writer.Finish(&file_empty_info);
+ ASSERT_NOK(s);
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ s = DeprecatedAddFile({file1});
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ // Adding a file while holding a snapshot will fail
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile({file2}));
+ db_->ReleaseSnapshot(s1);
+ }
+ // We can add the file after releasing the snapshot
+ ASSERT_OK(DeprecatedAddFile({file2}));
+
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 200; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ // This file has overlapping values with the existing data
+ s = DeprecatedAddFile({file3});
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // This file has overlapping values with the existing data
+ s = DeprecatedAddFile({file4});
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // Overwrite values of keys divisible by 5
+ for (int k = 0; k < 200; k += 5) {
+ ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+ }
+ ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+ // Key range of file5 (400 => 499) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file5}));
+
+ // This file has overlapping values with the existing data
+ s = DeprecatedAddFile({file6});
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // Key range of file7 (500 => 598) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file7}));
+
+ // Key range of file8 (600 => 700) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file8}));
+
+ // Make sure values are correct before and after flush/compaction
+ for (int i = 0; i < 2; i++) {
+ for (int k = 0; k < 200; k++) {
+ std::string value = Key(k) + "_val";
+ if (k % 5 == 0) {
+ value += "_new";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 400; k < 500; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 500; k < 600; k++) {
+ std::string value = Key(k) + "_val";
+ if (k < 520 || k % 2 == 1) {
+ value = "NOT_FOUND";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ Close();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // Delete keys in range (400 => 499)
+ for (int k = 400; k < 500; k++) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // We deleted range (400 => 499) but cannot add file5 because
+ // of the range tombstones
+ ASSERT_NOK(DeprecatedAddFile({file5}));
+
+ // Compacting the DB will remove the tombstones
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Now we can add the file
+ ASSERT_OK(DeprecatedAddFile({file5}));
+
+ // Verify values of file5 in DB
+ for (int k = 400; k < 500; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+ kRangeDelSkipConfigs));
+}
+
+class SstFileWriterCollector : public TablePropertiesCollector {
+ public:
+ explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) {
+ name_ = prefix_ + "_SstFileWriterCollector";
+ }
+
+ const char* Name() const override { return name_.c_str(); }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string count = std::to_string(count_);
+ *properties = UserCollectedProperties{
+ {prefix_ + "_SstFileWriterCollector", "YES"},
+ {prefix_ + "_Count", count},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ ++count_;
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+ std::string prefix_;
+ std::string name_;
+};
+
+class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ explicit SstFileWriterCollectorFactory(std::string prefix)
+ : prefix_(prefix), num_created_(0) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ num_created_++;
+ return new SstFileWriterCollector(prefix_);
+ }
+ const char* Name() const override { return "SstFileWriterCollectorFactory"; }
+
+ std::string prefix_;
+ uint32_t num_created_;
+};
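+ // These collectors run while SstFileWriter builds each external file, so
+ // every ingested table carries the abc_/xyz_ user properties that the
+ // AddList test below verifies through GetPropertiesOfAllTables().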
+
+TEST_F(ExternalSSTFileTest, AddList) {
+ do {
+ Options options = CurrentOptions();
+
+ auto abc_collector = std::make_shared<SstFileWriterCollectorFactory>("abc");
+ auto xyz_collector = std::make_shared<SstFileWriterCollectorFactory>("xyz");
+
+ options.table_properties_collector_factories.emplace_back(abc_collector);
+ options.table_properties_collector_factories.emplace_back(xyz_collector);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ // sst_file_writer has already finished; cannot add this value
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // file2.sst (100 => 199)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ // Cannot add this key because it's not after last added key
+ s = sst_file_writer.Put(Key(99), "bad_val");
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 100);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(199));
+
+ // file3.sst (195 => 199)
+ // This file's values overlap with file2's values
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 195; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 5);
+ ASSERT_EQ(file3_info.smallest_key, Key(195));
+ ASSERT_EQ(file3_info.largest_key, Key(199));
+
+ // file4.sst (30 => 39)
+ // This file's values overlap with file1's values
+ std::string file4 = sst_files_dir_ + "file4.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 30; k < 40; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ s = sst_file_writer.Finish(&file4_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 10);
+ ASSERT_EQ(file4_info.smallest_key, Key(30));
+ ASSERT_EQ(file4_info.largest_key, Key(39));
+
+ // file5.sst (200 => 299)
+ std::string file5 = sst_files_dir_ + "file5.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 200; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file5_info;
+ s = sst_file_writer.Finish(&file5_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 100);
+ ASSERT_EQ(file5_info.smallest_key, Key(200));
+ ASSERT_EQ(file5_info.largest_key, Key(299));
+
+ // file6.sst (delete 0 => 100)
+ std::string file6 = sst_files_dir_ + "file6.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75)));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100)));
+ ExternalSstFileInfo file6_info;
+ s = sst_file_writer.Finish(&file6_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 0);
+ ASSERT_EQ(file6_info.smallest_key, "");
+ ASSERT_EQ(file6_info.largest_key, "");
+ ASSERT_EQ(file6_info.num_range_del_entries, 2);
+ ASSERT_EQ(file6_info.smallest_range_del_key, Key(0));
+ ASSERT_EQ(file6_info.largest_range_del_key, Key(100));
+
+ // file7.sst (delete 99 => 201)
+ std::string file7 = sst_files_dir_ + "file7.sst";
+ ASSERT_OK(sst_file_writer.Open(file7));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201)));
+ ExternalSstFileInfo file7_info;
+ s = sst_file_writer.Finish(&file7_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file7_info.file_path, file7);
+ ASSERT_EQ(file7_info.num_entries, 0);
+ ASSERT_EQ(file7_info.smallest_key, "");
+ ASSERT_EQ(file7_info.largest_key, "");
+ ASSERT_EQ(file7_info.num_range_del_entries, 1);
+ ASSERT_EQ(file7_info.smallest_range_del_key, Key(99));
+ ASSERT_EQ(file7_info.largest_range_del_key, Key(201));
+
+ // file_list1 has an internal key range conflict
+ std::vector<std::string> file_list0({file1, file2});
+ std::vector<std::string> file_list1({file3, file2, file1});
+ std::vector<std::string> file_list2({file5});
+ std::vector<std::string> file_list3({file3, file4});
+ std::vector<std::string> file_list4({file5, file7});
+ std::vector<std::string> file_list5({file6, file7});
+
+ DestroyAndReopen(options);
+
+ // These lists of files have key ranges that overlap with each other
+ s = DeprecatedAddFile(file_list1);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ // Both of the following overlap on the range deletion tombstone.
+ s = DeprecatedAddFile(file_list4);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+ s = DeprecatedAddFile(file_list5);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // Add files using file path list
+ s = DeprecatedAddFile(file_list0);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 200; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(props.size(), 2);
+ for (auto file_props : props) {
+ auto user_props = file_props.second->user_collected_properties;
+ ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["abc_Count"], "100");
+ ASSERT_EQ(user_props["xyz_Count"], "100");
+ }
+
+ // Adding a file while holding a snapshot will fail
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile(file_list2));
+ db_->ReleaseSnapshot(s1);
+ }
+ // We can add the file after releasing the snapshot
+ ASSERT_OK(DeprecatedAddFile(file_list2));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 300; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(props.size(), 3);
+ for (auto file_props : props) {
+ auto user_props = file_props.second->user_collected_properties;
+ ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["abc_Count"], "100");
+ ASSERT_EQ(user_props["xyz_Count"], "100");
+ }
+
+ // This file list has keys that overlap with the existing data
+ s = DeprecatedAddFile(file_list3);
+ ASSERT_FALSE(s.ok()) << s.ToString();
+
+ // Overwrite values of keys divisible by 5
+ for (int k = 0; k < 200; k += 5) {
+ ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+ }
+ ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+ // Make sure values are correct before and after flush/compaction
+ for (int i = 0; i < 2; i++) {
+ for (int k = 0; k < 200; k++) {
+ std::string value = Key(k) + "_val";
+ if (k % 5 == 0) {
+ value += "_new";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 200; k < 300; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ // Delete keys in range (200 => 299)
+ for (int k = 200; k < 300; k++) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // We deleted the range (200 => 299), but we still cannot add file5 because
+ // of the tombstones left by those deletes
+ ASSERT_NOK(DeprecatedAddFile(file_list2));
+
+ // Compacting the DB will remove the tombstones
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Now we can add the file
+ ASSERT_OK(DeprecatedAddFile(file_list2));
+
+ // Verify values of file5 in DB
+ for (int k = 200; k < 300; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+ kRangeDelSkipConfigs));
+}
+
+TEST_F(ExternalSSTFileTest, AddListAtomicity) {
+ do {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // files[0].sst (0 => 99)
+ // files[1].sst (100 => 199)
+ // ...
+ // files[8].sst (800 => 899)
+ int n = 9;
+ std::vector<std::string> files(n);
+ std::vector<ExternalSstFileInfo> files_info(n);
+ for (int i = 0; i < n; i++) {
+ files[i] = sst_files_dir_ + "file" + std::to_string(i) + ".sst";
+ ASSERT_OK(sst_file_writer.Open(files[i]));
+ for (int k = i * 100; k < (i + 1) * 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ Status s = sst_file_writer.Finish(&files_info[i]);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(files_info[i].file_path, files[i]);
+ ASSERT_EQ(files_info[i].num_entries, 100);
+ ASSERT_EQ(files_info[i].smallest_key, Key(i * 100));
+ ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1));
+ }
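+ // Append one more path that was never written; the whole AddFile call must
+ // fail and none of the valid files should become visible.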
+ files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst");
+ auto s = DeprecatedAddFile(files);
+ ASSERT_NOK(s) << s.ToString();
+ for (int k = 0; k < n * 100; k++) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(k)));
+ }
+ files.pop_back();
+ ASSERT_OK(DeprecatedAddFile(files));
+ for (int k = 0; k < n * 100; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+// This test reproduces a bug that can happen when the DB starts purging
+// obsolete files while we are adding an external SST file. That situation
+// may result in the file being deleted while it is being added.
+TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) {
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 500)
+ std::string sst_file_path = sst_files_dir_ + "file1.sst";
+ Status s = sst_file_writer.Open(sst_file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i < 500; i++) {
+ std::string k = Key(i);
+ s = sst_file_writer.Put(k, k + "_val");
+ ASSERT_OK(s);
+ }
+
+ ExternalSstFileInfo sst_file_info;
+ s = sst_file_writer.Finish(&sst_file_info);
+ ASSERT_OK(s);
+
+ options.delete_obsolete_files_period_micros = 0;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
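+ // While the ingestion job is preparing the file, force flushes and a
+ // compaction so that obsolete file purging runs concurrently with the add.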
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:FileAdded", [&](void* /* arg */) {
+ ASSERT_OK(Put("aaa", "bbb"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("aaa", "xxx"));
+ ASSERT_OK(Flush());
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ s = DeprecatedAddFile({sst_file_path});
+ ASSERT_OK(s);
+
+ for (int i = 0; i < 500; i++) {
+ std::string k = Key(i);
+ std::string v = k + "_val";
+ ASSERT_EQ(Get(k), v);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, SkipSnapshot) {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+
+ // file2.sst (100 => 299)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(299));
+
+ ASSERT_OK(DeprecatedAddFile({file1}));
+
+ // Adding a file will fail while holding a snapshot when the default
+ // skip_snapshot_check (false) is used
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile({file2}));
+ }
+
+ // Adding a file succeeds when skip_snapshot_check is set to true, even
+ // while the DB holds a snapshot
+ if (s1 != nullptr) {
+ ASSERT_OK(DeprecatedAddFile({file2}, false, true));
+ db_->ReleaseSnapshot(s1);
+ }
+
+ // file3.sst (300 => 399)
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 300; k < 400; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 100);
+ ASSERT_EQ(file3_info.smallest_key, Key(300));
+ ASSERT_EQ(file3_info.largest_key, Key(399));
+
+ // Key(300) should not exist before ingesting file3
+ ASSERT_EQ(Get(Key(300)), "NOT_FOUND");
+ const Snapshot* s2 = db_->GetSnapshot();
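+ // With skip_snapshot_check the ingested value becomes visible even through
+ // the snapshot taken just above.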
+ ASSERT_OK(DeprecatedAddFile({file3}, false, true));
+ ASSERT_EQ(Get(Key(300)), Key(300) + ("_val"));
+ ASSERT_EQ(Get(Key(300), s2), Key(300) + ("_val"));
+
+ db_->ReleaseSnapshot(s2);
+}
+
+TEST_F(ExternalSSTFileTest, MultiThreaded) {
+ // Bulk load 10 files, each containing 1000 keys
+ int num_files = 10;
+ int keys_per_file = 1000;
+
+ // Generate file names
+ std::vector<std::string> file_names;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file_" + ToString(i) + ".sst";
+ file_names.push_back(sst_files_dir_ + file_name);
+ }
+
+ do {
+ Options options = CurrentOptions();
+
+ std::atomic<int> thread_num(0);
+ std::function<void()> write_file_func = [&]() {
+ int file_idx = thread_num.fetch_add(1);
+ int range_start = file_idx * keys_per_file;
+ int range_end = range_start + keys_per_file;
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ ASSERT_OK(sst_file_writer.Open(file_names[file_idx]));
+
+ for (int k = range_start; k < range_end; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+ }
+
+ Status s = sst_file_writer.Finish();
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ };
+ // Write num_files files in parallel
+ std::vector<port::Thread> sst_writer_threads;
+ for (int i = 0; i < num_files; ++i) {
+ sst_writer_threads.emplace_back(write_file_func);
+ }
+
+ for (auto& t : sst_writer_threads) {
+ t.join();
+ }
+
+ fprintf(stderr, "Wrote %d files (%d keys)\n", num_files,
+ num_files * keys_per_file);
+
+ thread_num.store(0);
+ std::atomic<int> files_added(0);
+ // Thread 0 -> Load {f0,f1}
+ // Thread 1 -> Load {f0,f1}
+ // Thread 2 -> Load {f2,f3}
+ // Thread 3 -> Load {f2,f3}
+ // Thread 4 -> Load {f4,f5}
+ // Thread 5 -> Load {f4,f5}
+ // ...
+ std::function<void()> load_file_func = [&]() {
+ // We intentionally add every file twice, and assert that it was added
+ // only once and the other add failed
+ int thread_id = thread_num.fetch_add(1);
+ int file_idx = (thread_id / 2) * 2;
+ // Sometimes we copy the file, sometimes we move (link) it; the result
+ // should be the same
+ bool move_file = (thread_id % 3 == 0);
+
+ std::vector<std::string> files_to_add;
+
+ files_to_add = {file_names[file_idx]};
+ if (static_cast<size_t>(file_idx + 1) < file_names.size()) {
+ files_to_add.push_back(file_names[file_idx + 1]);
+ }
+
+ Status s = DeprecatedAddFile(files_to_add, move_file);
+ if (s.ok()) {
+ files_added += static_cast<int>(files_to_add.size());
+ }
+ };
+
+ // Bulk load num_files files in parallel
+ std::vector<port::Thread> add_file_threads;
+ DestroyAndReopen(options);
+ for (int i = 0; i < num_files; ++i) {
+ add_file_threads.emplace_back(load_file_func);
+ }
+
+ for (auto& t : add_file_threads) {
+ t.join();
+ }
+ ASSERT_EQ(files_added.load(), num_files);
+ fprintf(stderr, "Loaded %d files (%d keys)\n", num_files,
+ num_files * keys_per_file);
+
+ // Overwrite values of keys divisible by 100
+ for (int k = 0; k < num_files * keys_per_file; k += 100) {
+ std::string key = Key(k);
+ Status s = Put(key, key + "_new");
+ ASSERT_TRUE(s.ok());
+ }
+
+ for (int i = 0; i < 2; i++) {
+ // Make sure the values are correct before and after flush/compaction
+ for (int k = 0; k < num_files * keys_per_file; ++k) {
+ std::string key = Key(k);
+ std::string value = (k % 100 == 0) ? (key + "_new") : key;
+ ASSERT_EQ(Get(key), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ fprintf(stderr, "Verified %d values\n", num_files * keys_per_file);
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_F(ExternalSSTFileTest, OverlappingRanges) {
+ Random rnd(301);
+ SequenceNumber assigned_seqno = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Run", [&assigned_seqno](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ assigned_seqno = *(static_cast<SequenceNumber*>(arg));
+ });
+ bool need_flush = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IngestExternalFile:NeedFlush", [&need_flush](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ need_flush = *(static_cast<bool*>(arg));
+ });
+ bool overlap_with_db = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ [&overlap_with_db](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ overlap_with_db = *(static_cast<bool*>(arg));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
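+ // The flags recorded above are used below: under universal compaction,
+ // ingestion is expected to fail whenever a flush was needed, a non-zero
+ // global seqno was assigned, or the file overlapped with the DB.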
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ printf("Option config = %d\n", option_config_);
+ std::vector<std::pair<int, int>> key_ranges;
+ for (int i = 0; i < 100; i++) {
+ int range_start = rnd.Uniform(20000);
+ int keys_per_range = 10 + rnd.Uniform(41);
+
+ key_ranges.emplace_back(range_start, range_start + keys_per_range);
+ }
+
+ int memtable_add = 0;
+ int success_add_file = 0;
+ int failed_add_file = 0;
+ std::map<std::string, std::string> true_data;
+ for (size_t i = 0; i < key_ranges.size(); i++) {
+ int range_start = key_ranges[i].first;
+ int range_end = key_ranges[i].second;
+
+ Status s;
+ std::string range_val = "range_" + ToString(i);
+
+ // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile
+ if (i && i % 5 == 0) {
+ // Use DB::Put to insert range (insert into memtable)
+ range_val += "_put";
+ for (int k = range_start; k <= range_end; k++) {
+ s = Put(Key(k), range_val);
+ ASSERT_OK(s);
+ }
+ memtable_add++;
+ } else {
+ // Use DB::AddFile to insert range
+ range_val += "_add_file";
+
+ // Generate the file containing the range
+ std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+ ASSERT_OK(sst_file_writer.Open(file_name));
+ for (int k = range_start; k <= range_end; k++) {
+ s = sst_file_writer.Put(Key(k), range_val);
+ ASSERT_OK(s);
+ }
+ ExternalSstFileInfo file_info;
+ s = sst_file_writer.Finish(&file_info);
+ ASSERT_OK(s);
+
+ // Insert the generated file
+ s = DeprecatedAddFile({file_name});
+ auto it = true_data.lower_bound(Key(range_start));
+ if (option_config_ != kUniversalCompaction &&
+ option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ if (it != true_data.end() && it->first <= Key(range_end)) {
+ // This range overlaps with data that already exists in the DB
+ ASSERT_NOK(s);
+ failed_add_file++;
+ } else {
+ ASSERT_OK(s);
+ success_add_file++;
+ }
+ } else {
+ if ((it != true_data.end() && it->first <= Key(range_end)) ||
+ need_flush || assigned_seqno > 0 || overlap_with_db) {
+ // This range overlaps with data that already exists in the DB
+ ASSERT_NOK(s);
+ failed_add_file++;
+ } else {
+ ASSERT_OK(s);
+ success_add_file++;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ // Update true_data map to include the new inserted data
+ for (int k = range_start; k <= range_end; k++) {
+ true_data[Key(k)] = range_val;
+ }
+ }
+
+ // Flush / Compact the DB
+ if (i && i % 50 == 0) {
+ Flush();
+ }
+ if (i && i % 75 == 0) {
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+ }
+
+ printf("Total: %" ROCKSDB_PRIszt
+ " ranges\n"
+ "AddFile()|Success: %d ranges\n"
+ "AddFile()|RangeConflict: %d ranges\n"
+ "Put(): %d ranges\n",
+ key_ranges.size(), success_add_file, failed_add_file, memtable_add);
+
+ // Verify the correctness of the data
+ for (const auto& kv : true_data) {
+ ASSERT_EQ(Get(kv.first), kv.second);
+ }
+ printf("keys/values verified\n");
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_P(ExternalSSTFileTest, PickedLevel) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+
+ std::map<std::string, std::string> true_data;
+
+ // File 0 will go to last level (L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "0,0,0,1");
+
+ // File 1 will go to level L2 (since it overlaps with file 0 in L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "0,0,1,1");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ExternalSSTFileTest::PickedLevel:0", "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Start",
+ "ExternalSSTFileTest::PickedLevel:1"},
+ {"ExternalSSTFileTest::PickedLevel:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
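+ // With these dependencies the compaction starts only after PickedLevel:0
+ // and is held at NonTrivial:AfterRun until PickedLevel:2, so the ingestions
+ // below happen while the compaction is running.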
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Flush 4 files containing the same keys
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put(Key(3), Key(3) + "put"));
+ ASSERT_OK(Put(Key(8), Key(8) + "put"));
+ true_data[Key(3)] = Key(3) + "put";
+ true_data[Key(8)] = Key(8) + "put";
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for BackgroundCompaction() to be called
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:0");
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:1");
+
+ EXPECT_EQ(FilesPerLevel(), "4,0,1,1");
+
+ // This file overlaps with file 0 (L3), file 1 (L2) and the
+ // output of compaction going to L1
+ ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,1,1");
+
+ // This file does not overlap with any file or with the running compaction
+ ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+ false, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,1,2");
+
+ // Release the compaction so it can finish (it was held at NonTrivial:AfterRun)
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2");
+
+ dbfull()->TEST_WaitForCompact();
+ EXPECT_EQ(FilesPerLevel(), "1,1,1,2");
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, PickedLevelBug) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 3;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ std::vector<int> file_keys;
+
+ // file #1 in L0
+ file_keys = {0, 5, 7};
+ for (int k : file_keys) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ }
+ ASSERT_OK(Flush());
+
+ // file #2 in L0
+ file_keys = {4, 6, 8, 9};
+ for (int k : file_keys) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ }
+ ASSERT_OK(Flush());
+
+ // We have 2 overlapping files in L0
+ EXPECT_EQ(FilesPerLevel(), "2");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::PickedLevelBug:0"},
+ {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"},
+ {"ExternalSSTFileTest::PickedLevelBug:2",
+ "DBImpl::RunManualCompaction:0"},
+ {"ExternalSSTFileTest::PickedLevelBug:3",
+ "DBImpl::RunManualCompaction:1"}});
+
+ std::atomic<bool> bg_compact_started(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { bg_compact_started.store(true); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start a thread that will ask for compaction while AddFile is writing
+ // the MANIFEST
+ ROCKSDB_NAMESPACE::port::Thread bg_compact([&]() {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ });
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");
+
+ // Start a thread that will ingest a new file
+ ROCKSDB_NAMESPACE::port::Thread bg_addfile([&]() {
+ file_keys = {1, 2, 3};
+ ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, 1));
+ });
+
+ // Wait for AddFile to start picking levels and writing MANIFEST
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");
+
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");
+
+ // We need to verify that no compactions can run while AddFile is
+ // ingesting the files into the levels it finds suitable. So we
+ // wait for 2 seconds to give compactions a chance to run during
+ // this period, and then make sure that no compactions were able to run
+ env_->SleepForMicroseconds(1000000 * 2);
+ ASSERT_FALSE(bg_compact_started.load());
+
+ // Release AddFile so it can finish writing the MANIFEST
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
+
+ bg_addfile.join();
+ bg_compact.join();
+
+ dbfull()->TEST_WaitForCompact();
+
+ int total_keys = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ total_keys++;
+ }
+ ASSERT_EQ(total_keys, 10);
+
+ delete iter;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, IngestNonExistingFile) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ Status s = db_->IngestExternalFile({"non_existing_file"},
+ IngestExternalFileOptions());
+ ASSERT_NOK(s);
+
+ // Verify file deletion is not impacted (verify a bug fix)
+ ASSERT_OK(Put(Key(1), Key(1)));
+ ASSERT_OK(Put(Key(9), Key(9)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(1), Key(1)));
+ ASSERT_OK(Put(Key(9), Key(9)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // After full compaction, there should be only 1 file.
+ std::vector<std::string> files;
+ env_->GetChildren(dbname_, &files);
+ int num_sst_files = 0;
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kTableFile) {
+ num_sst_files++;
+ }
+ }
+ ASSERT_EQ(1, num_sst_files);
+}
+
+TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ std::function<void()> bg_compact = [&]() {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ };
+
+ int range_id = 0;
+ std::vector<int> file_keys;
+ std::function<void()> bg_addfile = [&]() {
+ ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id));
+ };
+
+ const int num_of_ranges = 1000;
+ std::vector<port::Thread> threads;
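+ // For each range the boundary keys go through Put/Flush while the interior
+ // keys are ingested in a background thread; every 10th range also races a
+ // manual compaction.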
+ while (range_id < num_of_ranges) {
+ int range_start = range_id * 10;
+ int range_end = range_start + 10;
+
+ file_keys.clear();
+ for (int k = range_start + 1; k < range_end; k++) {
+ file_keys.push_back(k);
+ }
+ ASSERT_OK(Put(Key(range_start), Key(range_start)));
+ ASSERT_OK(Put(Key(range_end), Key(range_end)));
+ ASSERT_OK(Flush());
+
+ if (range_id % 10 == 0) {
+ threads.emplace_back(bg_compact);
+ }
+ threads.emplace_back(bg_addfile);
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ threads.clear();
+
+ range_id++;
+ }
+
+ for (int rid = 0; rid < num_of_ranges; rid++) {
+ int range_start = rid * 10;
+ int range_end = range_start + 10;
+
+ ASSERT_EQ(Get(Key(range_start)), Key(range_start)) << rid;
+ ASSERT_EQ(Get(Key(range_end)), Key(range_end)) << rid;
+ for (int k = range_start + 1; k < range_end; k++) {
+ std::string v = Key(k) + ToString(rid);
+ ASSERT_EQ(Get(Key(k)), v) << rid;
+ }
+ }
+}
+
+TEST_F(ExternalSSTFileTest, PickedLevelDynamic) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ExternalSSTFileTest::PickedLevelDynamic:0",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Start",
+ "ExternalSSTFileTest::PickedLevelDynamic:1"},
+ {"ExternalSSTFileTest::PickedLevelDynamic:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Flush 4 files containing the same keys
+ for (int i = 0; i < 4; i++) {
+ for (int k = 20; k <= 30; k++) {
+ ASSERT_OK(Put(Key(k), Key(k) + "put"));
+ true_data[Key(k)] = Key(k) + "put";
+ }
+ for (int k = 50; k <= 60; k++) {
+ ASSERT_OK(Put(Key(k), Key(k) + "put"));
+ true_data[Key(k)] = Key(k) + "put";
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for BackgroundCompaction() to be called
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:0");
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:1");
+
+ // This file overlaps with the output of the compaction (going to L3)
+ // so the file will be added to L0 since L3 is the base level
+ ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false,
+ false, true, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5");
+
+ // This file does not overlap with the currently running compaction
+ ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+ true, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,0,1");
+
+ // Release the compaction so it can finish (it was held at NonTrivial:AfterRun)
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2");
+
+ // Output of the compaction will go to L3
+ dbfull()->TEST_WaitForCompact();
+ EXPECT_EQ(FilesPerLevel(), "1,0,0,2");
+
+ Close();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 15, 19}, -1, false, false,
+ true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,3");
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1000, 1001, 1002}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,4");
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {500, 600, 700}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,5");
+
+ // File 5 overlaps with file 2 (L3 / base level)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, true,
+ false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "2,0,0,5");
+
+ // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, true,
+ false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "3,0,0,5");
+
+ // Verify data in files
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ // Write range [5 => 10] to L0
+ for (int i = 5; i <= 10; i++) {
+ std::string k = Key(i);
+ std::string v = k + "put";
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+ ASSERT_OK(Flush());
+ ASSERT_EQ(FilesPerLevel(), "4,0,0,5");
+
+ // File 7 overlaps with file 4 (L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {650, 651, 652}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "5,0,0,5");
+
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, AddExternalSstFileWithCustomCompartor) {
+ Options options = CurrentOptions();
+ options.comparator = ReverseBytewiseComparator();
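+ // With the reverse comparator, a file's smallest key is the numerically
+ // largest key it contains.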
+ DestroyAndReopen(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Generate files with these key ranges
+ // {14 -> 0}
+ // {24 -> 10}
+ // {34 -> 20}
+ // {44 -> 30}
+ // ..
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+ ASSERT_OK(sst_file_writer.Open(file_name));
+
+ int range_end = i * 10;
+ int range_start = range_end + 15;
+ for (int k = (range_start - 1); k >= range_end; k--) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+ }
+ ExternalSstFileInfo file_info;
+ ASSERT_OK(sst_file_writer.Finish(&file_info));
+ generated_files.push_back(file_name);
+ }
+
+ std::vector<std::string> in_files;
+
+ // The 2nd and 3rd files in this list overlap with each other
+ in_files = {generated_files[0], generated_files[4], generated_files[5],
+ generated_files[7]};
+ ASSERT_NOK(DeprecatedAddFile(in_files));
+
+ // These 2 files don't overlap with each other
+ in_files = {generated_files[0], generated_files[2]};
+ ASSERT_OK(DeprecatedAddFile(in_files));
+
+ // These 2 files don't overlap with each other but overlap with keys in the DB
+ in_files = {generated_files[3], generated_files[7]};
+ ASSERT_NOK(DeprecatedAddFile(in_files));
+
+ // These files don't overlap with each other or with the DB key range
+ in_files = {generated_files[4], generated_files[6], generated_files[8]};
+ ASSERT_OK(DeprecatedAddFile(in_files));
+
+ for (int i = 0; i < 100; i++) {
+ if (i % 20 <= 14) {
+ ASSERT_EQ(Get(Key(i)), Key(i));
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+}
+
+TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.IncreaseParallelism(20);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 4}, 1)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 3}, 2)); // L2
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {10, 14}, 3)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {12, 13}, 4)); // L2
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {20, 24}, 5)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {22, 23}, 6)); // L2
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ // The file fits in L3 but overlaps with the running compaction, so it
+ // will be added to L2; a trivial-move compaction could then move it to
+ // L3 and break LSM consistency
+ static std::atomic<bool> called = {false};
+ if (!called) {
+ called = true;
+ ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}}));
+ ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7));
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ dbfull()->TEST_WaitForCompact();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, CompactAddedFiles) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, 1)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, 2)); // L2
+ ASSERT_OK(GenerateAndAddExternalFile(options, {3, 8}, 3)); // L1
+ ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, 4)); // L0
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+}
+
+TEST_F(ExternalSSTFileTest, SstFileWriterNonSharedKeys) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::string file_path = sst_files_dir_ + "/not_shared";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ std::string suffix(100, 'X');
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("A" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("BB" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CC" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CXD" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CZZZ" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("ZAAAX" + suffix, "VAL"));
+
+ ASSERT_OK(sst_file_writer.Finish());
+ ASSERT_OK(DeprecatedAddFile({file_path}));
+}
+
+TEST_F(ExternalSSTFileTest, WithUnorderedWrite) {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+ "ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"},
+ {"DBImpl::WaitForPendingWrites:BeforeBlock",
+ "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IngestExternalFile:NeedFlush", [&](void* need_flush) {
+ ASSERT_TRUE(*reinterpret_cast<bool*>(need_flush));
+ });
+
+ Options options = CurrentOptions();
+ options.unordered_write = true;
+ DestroyAndReopen(options);
+ Put("foo", "v1");
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer([&]() { Put("bar", "v2"); });
+
+ TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL");
+ ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1,
+ true /* allow_global_seqno */));
+ ASSERT_EQ(Get("bar"), "v3");
+
+ writer.join();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) {
+ Options options = CurrentOptions();
+ options.IncreaseParallelism(20);
+ options.level0_slowdown_writes_trigger = 256;
+ options.level0_stop_writes_trigger = 256;
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
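+ // Iteration 0 mixes memtable writes with ingestion; iteration 1 ingests
+ // everything through external files.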
+ for (int iter = 0; iter < 2; iter++) {
+ bool write_to_memtable = (iter == 0);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < 500; i++) {
+ std::vector<std::pair<std::string, std::string>> random_data;
+ for (int j = 0; j < 100; j++) {
+ std::string k;
+ std::string v;
+ test::RandomString(&rnd, rnd.Next() % 20, &k);
+ test::RandomString(&rnd, rnd.Next() % 50, &v);
+ random_data.emplace_back(k, v);
+ }
+
+ if (write_to_memtable && rnd.OneIn(4)) {
+ // 25% of writes go through memtable
+ for (auto& entry : random_data) {
+ ASSERT_OK(Put(entry.first, entry.second));
+ true_data[entry.first] = entry.second;
+ }
+ } else {
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, random_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, &true_data));
+ }
+ }
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ VerifyDBFromMap(true_data, &kcnt, false);
+ }
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) {
+ Options options = CurrentOptions();
+ options.num_levels = 5;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ std::vector<std::pair<std::string, std::string>> file_data;
+ std::map<std::string, std::string> true_data;
+
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+
+ // Insert 0 -> 20 using AddFile
+ file_data.clear();
+ for (int i = 0; i <= 20; i++) {
+ file_data.emplace_back(Key(i), "L4");
+ }
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+ // This file doesn't overlap with anything in the DB, so it will go to L4
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel());
+
+ // Insert 80 -> 130 using AddFile
+ file_data.clear();
+ for (int i = 80; i <= 130; i++) {
+ file_data.emplace_back(Key(i), "L0");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+ // This file overlaps with the memtable, so ingestion will flush the
+ // memtable and add the file itself to L0
+ ASSERT_EQ("2,0,0,0,1", FilesPerLevel());
+
+ // Insert 30 -> 50 using AddFile
+ file_data.clear();
+ for (int i = 30; i <= 50; i++) {
+ file_data.emplace_back(Key(i), "L4");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+ // This file doesn't overlap with anything in the DB and also fits in L4
+ ASSERT_EQ("2,0,0,0,2", FilesPerLevel());
+
+ // Insert 10 -> 40 using AddFile
+ file_data.clear();
+ for (int i = 10; i <= 40; i++) {
+ file_data.emplace_back(Key(i), "L3");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+ // This file overlaps with files in L4, so we will ingest it into L3
+ ASSERT_EQ("2,0,0,1,2", FilesPerLevel());
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ uint64_t entries_in_memtable;
+ std::map<std::string, std::string> true_data;
+
+ for (int k : {10, 20, 40, 80}) {
+ ASSERT_OK(Put(Key(k), "memtable"));
+ true_data[Key(k)] = "memtable";
+ }
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_GE(entries_in_memtable, 1);
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // No need for flush
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {90, 100, 110}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // This file will flush the memtable
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {19, 20, 21}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_EQ(entries_in_memtable, 0);
+
+ for (int k : {200, 201, 205, 206}) {
+ ASSERT_OK(Put(Key(k), "memtable"));
+ true_data[Key(k)] = "memtable";
+ }
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // No need for flush, this file's keys fit between the memtable keys
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {202, 203, 204}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // This file will flush the memtable
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {206, 207}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable);
+ ASSERT_EQ(entries_in_memtable, 0);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, L0SortingIssue) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ ASSERT_OK(Put(Key(1), "memtable"));
+ ASSERT_OK(Put(Key(10), "memtable"));
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // No Flush needed, No global seqno needed, Ingest in L1
+ ASSERT_OK(
+ GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false));
+ // No Flush needed, but need a global seqno, Ingest in L0
+ ASSERT_OK(
+ GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false));
+ printf("%s\n", FilesPerLevel().c_str());
+
+ // Overwrite what we added using external files
+ ASSERT_OK(Put(Key(7), "memtable"));
+ ASSERT_OK(Put(Key(8), "memtable"));
+
+ // Read values from memtable
+ ASSERT_EQ(Get(Key(7)), "memtable");
+ ASSERT_EQ(Get(Key(8)), "memtable");
+
+ // Flush and read from L0
+ ASSERT_OK(Flush());
+ printf("%s\n", FilesPerLevel().c_str());
+ ASSERT_EQ(Get(Key(7)), "memtable");
+ ASSERT_EQ(Get(Key(8)), "memtable");
+}
+
+TEST_F(ExternalSSTFileTest, CompactionDeadlock) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 4;
+ DestroyAndReopen(options);
+
+ // Atomic counter of currently running bg threads
+ std::atomic<int> running_threads(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::DelayWrite:Wait", "ExternalSSTFileTest::DeadLock:0"},
+ {"ExternalSSTFileTest::DeadLock:1", "DBImpl::AddFile:Start"},
+ {"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::DeadLock:2"},
+ {"ExternalSSTFileTest::DeadLock:3", "BackgroundCallCompaction:0"},
+ });
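+ // Enforced order: the blocked Put reaches DelayWrite:Wait first, then
+ // AddFile starts and acquires the mutex, and only then is the compaction
+ // allowed to run.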
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start ingesting an external file in the background
+ ROCKSDB_NAMESPACE::port::Thread bg_ingest_file([&]() {
+ running_threads += 1;
+ ASSERT_OK(GenerateAndAddExternalFile(options, {5, 6}));
+ running_threads -= 1;
+ });
+
+ ASSERT_OK(Put(Key(1), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(2), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(3), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(4), "memtable"));
+ ASSERT_OK(Flush());
+
+ // This thread will try to insert into the memtable, but since we have 4 L0
+ // files it will be blocked, holding the writer thread
+ ROCKSDB_NAMESPACE::port::Thread bg_block_put([&]() {
+ running_threads += 1;
+ ASSERT_OK(Put(Key(10), "memtable"));
+ running_threads -= 1;
+ });
+
+ // Make sure DelayWrite is called first
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:0");
+
+ // `DBImpl::AddFile:Start` will wait until we reach here
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:1");
+
+ // Wait for IngestExternalFile() to start and acquire the mutex
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:2");
+
+ // Now let compaction start
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:3");
+
+ // Wait for at most 5 seconds; if the bg threads did not all finish by
+ // then, we hit the deadlock bug
+ for (int i = 0; i < 10; i++) {
+ if (running_threads.load() == 0) {
+ break;
+ }
+ env_->SleepForMicroseconds(500000);
+ }
+
+ ASSERT_EQ(running_threads.load(), 0);
+
+ bg_ingest_file.join();
+ bg_block_put.join();
+}
+
+TEST_F(ExternalSSTFileTest, DirtyExit) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::string file_path = sst_files_dir_ + "/dirty_exit";
+ std::unique_ptr<SstFileWriter> sst_file_writer;
+
+ // Destruct SstFileWriter without calling Finish()
+ sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+ ASSERT_OK(sst_file_writer->Open(file_path));
+ sst_file_writer.reset();
+
+ // Destruct SstFileWriter with a failing Finish
+ sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+ ASSERT_OK(sst_file_writer->Open(file_path));
+ ASSERT_NOK(sst_file_writer->Finish());
+}
+
+TEST_F(ExternalSSTFileTest, FileWithCFInfo) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko", "toto"}, options);
+
+ SstFileWriter sfw_default(EnvOptions(), options, handles_[0]);
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ SstFileWriter sfw_cf2(EnvOptions(), options, handles_[2]);
+ SstFileWriter sfw_unknown(EnvOptions(), options);
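+ // sfw_unknown is not tied to a column family, so files it writes can be
+ // ingested into any CF.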
+
+ // default_cf.sst
+ const std::string cf_default_sst = sst_files_dir_ + "/default_cf.sst";
+ ASSERT_OK(sfw_default.Open(cf_default_sst));
+ ASSERT_OK(sfw_default.Put("K1", "V1"));
+ ASSERT_OK(sfw_default.Put("K2", "V2"));
+ ASSERT_OK(sfw_default.Finish());
+
+ // cf1.sst
+ const std::string cf1_sst = sst_files_dir_ + "/cf1.sst";
+ ASSERT_OK(sfw_cf1.Open(cf1_sst));
+ ASSERT_OK(sfw_cf1.Put("K3", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K4", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // cf_unknown.sst
+ const std::string unknown_sst = sst_files_dir_ + "/cf_unknown.sst";
+ ASSERT_OK(sfw_unknown.Open(unknown_sst));
+ ASSERT_OK(sfw_unknown.Put("K5", "V1"));
+ ASSERT_OK(sfw_unknown.Put("K6", "V2"));
+ ASSERT_OK(sfw_unknown.Finish());
+
+ IngestExternalFileOptions ifo;
+
+ // SST CF doesn't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo));
+ // SST CF doesn't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo));
+ // SST CF match
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo));
+
+ // SST CF doesn't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo));
+ // SST CF doesn't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo));
+ // SST CF match
+ ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo));
+
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[0], {unknown_sst}, ifo));
+
+ // Cannot ingest a file into a dropped CF
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ ASSERT_NOK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+
+ // This CF was not dropped, so it is OK to ingest
+ ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+}
+
+/*
+ * Test and verify the functionality of ingestion_options.move_files and
+ * ingestion_options.failed_move_fall_back_to_copy
+ */
+TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
+ const bool fail_link = std::get<0>(GetParam());
+ const bool failed_move_fall_back_to_copy = std::get<1>(GetParam());
+ test_env_->set_fail_link(fail_link);
+ const EnvOptions env_options;
+ DestroyAndReopen(options_);
+ const int kNumKeys = 10000;
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;
+
+ std::string file_path = sst_files_dir_ + "file1.sst";
+ // Create SstFileWriter for default column family
+ SstFileWriter sst_file_writer(env_options, options_);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+
+ bool copyfile = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:CopyFile",
+ [&](void* /* arg */) { copyfile = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ const Status s = db_->IngestExternalFile({file_path}, ifo);
+
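+ // Inspect the compaction stats to tell whether the file was moved (linked)
+ // or physically copied during ingestion.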
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ const InternalStats* internal_stats_ptr = cfd->internal_stats();
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ uint64_t bytes_copied = 0;
+ uint64_t bytes_moved = 0;
+ for (const auto& stats : comp_stats) {
+ bytes_copied += stats.bytes_written;
+ bytes_moved += stats.bytes_moved;
+ }
+
+ if (!fail_link) {
+ // Link operation succeeds. External SST should be moved.
+ ASSERT_OK(s);
+ ASSERT_EQ(0, bytes_copied);
+ ASSERT_EQ(file_size, bytes_moved);
+ ASSERT_FALSE(copyfile);
+ } else {
+ // Link operation fails.
+ ASSERT_EQ(0, bytes_moved);
+ if (failed_move_fall_back_to_copy) {
+ ASSERT_OK(s);
+ // Copy file is true since a failed link falls back to copy file.
+ ASSERT_TRUE(copyfile);
+ ASSERT_EQ(file_size, bytes_copied);
+ } else {
+ ASSERT_TRUE(s.IsNotSupported());
+ // Copy file is false since a failed link does not fall back to copy file.
+ ASSERT_FALSE(copyfile);
+ ASSERT_EQ(0, bytes_copied);
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class TestIngestExternalFileListener : public EventListener {
+ public:
+ void OnExternalFileIngested(DB* /*db*/,
+ const ExternalFileIngestionInfo& info) override {
+ ingested_files.push_back(info);
+ }
+
+ std::vector<ExternalFileIngestionInfo> ingested_files;
+};
+
+TEST_P(ExternalSSTFileTest, IngestionListener) {
+ Options options = CurrentOptions();
+ TestIngestExternalFileListener* listener =
+ new TestIngestExternalFileListener();
+ options.listeners.emplace_back(listener);
+ CreateAndReopenWithCF({"koko", "toto"}, options);
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // Ingest into default cf
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[0]));
+ ASSERT_EQ(listener->ingested_files.size(), 1);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "default");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "default");
+
+ // Ingest into cf1
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[1]));
+ ASSERT_EQ(listener->ingested_files.size(), 2);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "koko");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 1);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "koko");
+
+ // Ingest into cf2
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[2]));
+ ASSERT_EQ(listener->ingested_files.size(), 3);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "toto");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 2);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "toto");
+}
+
+TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ const int kNumKeys = 10000;
+
+ // Insert keys using normal path and take a snapshot
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + "_V1"));
+ }
+ const Snapshot* snap = db_->GetSnapshot();
+
+ // Overwrite all keys using IngestExternalFile
+ std::string sst_file_path = sst_files_dir_ + "file1.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ ASSERT_OK(sst_file_writer.Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_V2"));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ASSERT_OK(db_->IngestExternalFile({sst_file_path}, ifo));
+
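+ // The snapshot must keep seeing the pre-ingestion values while normal reads
+ // see the ingested ones.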
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_EQ(Get(Key(i), snap), Key(i) + "_V1");
+ ASSERT_EQ(Get(Key(i)), Key(i) + "_V2");
+ }
+
+ db_->ReleaseSnapshot(snap);
+}
+
+TEST_P(ExternalSSTFileTest, IngestBehind) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 3;
+ options.disable_auto_compactions = false;
+ DestroyAndReopen(options);
+ std::vector<std::pair<std::string, std::string>> file_data;
+ std::map<std::string, std::string> true_data;
+
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+
+ // Insert 0 -> 20 using IngestExternalFile
+ file_data.clear();
+ for (int i = 0; i <= 20; i++) {
+ file_data.emplace_back(Key(i), "ingest_behind");
+ }
+
+ bool allow_global_seqno = true;
+ bool ingest_behind = true;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+
+ // Can't ingest behind since allow_ingest_behind isn't set to true
+ ASSERT_NOK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
+ &true_data));
+
+ options.allow_ingest_behind = true;
+ // Check that we can still open the DB, as num_levels should be
+ // sanitized to 3
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ // The universal picker should place the data at the second level from
+ // the bottom (the bottom level is reserved for ingest-behind)
+ ASSERT_EQ("0,1", FilesPerLevel());
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, true /*ingest_behind*/,
+ false /*sort_data*/, &true_data));
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+ // This time ingestion should fail: the file doesn't fit in the bottom level
+ ASSERT_NOK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, true /*ingest_behind*/,
+ false /*sort_data*/, &true_data));
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ // The bottom level should now be empty
+ ASSERT_EQ("0,1", FilesPerLevel());
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
+ Options options = CurrentOptions();
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
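+ // With cache_index_and_filter_blocks enabled, adding a filter block to the
+ // block cache is visible through the BLOCK_CACHE_FILTER_ADD ticker.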
+
+ // Create external SST file and include bloom filters
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ {
+ std::string file_path = sst_files_dir_ + "sst_with_bloom.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(
+ db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+ ASSERT_EQ(Get("Key1"), "Value1");
+ ASSERT_GE(
+ options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1);
+ }
+
+ // Create external SST file but skip bloom filters
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ {
+ std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true,
+ Env::IOPriority::IO_TOTAL,
+ true /* skip_filters */);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(
+ db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+ ASSERT_EQ(Get("Key1"), "Value1");
+ ASSERT_EQ(
+ options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 0);
+ }
+}
+
+TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
+ if (!ZSTD_Supported()) {
+ return;
+ }
+ const int kNumEntries = 1 << 10;
+ const int kNumBytesPerEntry = 1 << 10;
+ Options options = CurrentOptions();
+ options.compression = kZSTD;
+ options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
+ options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
+ DestroyAndReopen(options);
+
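+ // The sync point below counts compression dictionary blocks written;
+ // SstFileWriter is expected to train and write exactly one.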
+ std::atomic<int> num_compression_dicts(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ [&](void* /* arg */) { ++num_compression_dicts; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::vector<std::pair<std::string, std::string>> random_data;
+ for (int i = 0; i < kNumEntries; i++) {
+ std::string val;
+ test::RandomString(&rnd, kNumBytesPerEntry, &val);
+ random_data.emplace_back(Key(i), std::move(val));
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data)));
+ ASSERT_EQ(1, num_compression_dicts);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data);
+ ASSERT_OK(s);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ const std::string& value = elem.second;
+ ASSERT_EQ(value, Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
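+// While an atomic ingestion into multiple column families is in flight, a
+// reader that took a snapshot beforehand must see only the pre-ingestion
+// data; once the ingestion finishes, all column families must expose the
+// ingested data consistently.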
+TEST_P(ExternalSSTFileTest,
+ IngestFilesIntoMultipleColumnFamilies_NoMixedStateWithSnapshot) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "BeforeRead"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "AfterRead",
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ const std::vector<std::map<std::string, std::string>> data_before_ingestion =
+ {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}},
+ {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}},
+ {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}};
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ int cf = static_cast<int>(i);
+ const auto& orig_data = data_before_ingestion[i];
+ for (const auto& kv : orig_data) {
+ ASSERT_OK(Put(cf, kv.first, kv.second));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ // Take snapshot before ingestion starts
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ read_opts.snapshot = dbfull()->GetSnapshot();
+ std::vector<Iterator*> iters(handles_.size());
+
+ // Range scan checks first kv of each CF before ingestion starts.
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ iters[i] = dbfull()->NewIterator(read_opts, handles_[i]);
+ iters[i]->SeekToFirst();
+ ASSERT_TRUE(iters[i]->Valid());
+ const std::string& key = iters[i]->key().ToString();
+ const std::string& value = iters[i]->value().ToString();
+ const std::map<std::string, std::string>& orig_data =
+ data_before_ingestion[i];
+ std::map<std::string, std::string>::const_iterator it = orig_data.find(key);
+ ASSERT_NE(orig_data.end(), it);
+ ASSERT_EQ(it->second, value);
+ iters[i]->Next();
+ }
+ port::Thread ingest_thread([&]() {
+ ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "BeforeRead");
+ // Should see only data before ingestion
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ const auto& orig_data = data_before_ingestion[i];
+ for (; iters[i]->Valid(); iters[i]->Next()) {
+ const std::string& key = iters[i]->key().ToString();
+ const std::string& value = iters[i]->value().ToString();
+ std::map<std::string, std::string>::const_iterator it =
+ orig_data.find(key);
+ ASSERT_NE(orig_data.end(), it);
+ ASSERT_EQ(it->second, value);
+ }
+ }
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "AfterRead");
+ ingest_thread.join();
+ for (auto* iter : iters) {
+ delete iter;
+ }
+ iters.clear();
+ dbfull()->ReleaseSnapshot(read_opts.snapshot);
+
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ // Should see consistent state after ingestion for all column families even
+ // without snapshot.
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ const std::string& value = elem.second;
+ ASSERT_EQ(value, Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
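+// The next three tests make the filesystem fail at different points during a
+// multi-column-family ingestion (see the test names) and verify that, after
+// reopening, none of the ingested keys are visible in any column family.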
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+ "0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+ "1",
+ "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingest
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data);
+ ASSERT_NOK(s);
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+ "0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+ "1");
+ ingest_thread.join();
+
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:BeforeJobsRun:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "1",
+ "DBImpl::IngestExternalFiles:BeforeJobsRun:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data);
+ ASSERT_NOK(s);
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "1");
+ ingest_thread.join();
+
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest,
+ IngestFilesIntoMultipleColumnFamilies_PartialManifestWriteFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ SyncPoint::GetInstance()->ClearTrace();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:1",
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data);
+ ASSERT_NOK(s);
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:1");
+ ingest_thread.join();
+
+ fault_injection_env->DropUnsyncedFileData();
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) {
+ Options options = CurrentOptions();
+ // Use large buffer to avoid memtable flush
+ options.write_buffer_size = 1024 * 1024;
+ options.two_write_queues = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1"));
+
+  // Put one key that overlaps with keys already in the memtable.
+  // Ingesting it will trigger a memtable flush, which requires this thread
+  // to be at the front of the 2nd writer queue. We must make sure that it
+  // won't enter the 2nd writer queue a second time.
+ std::vector<std::pair<std::string, std::string>> data;
+ data.push_back(std::make_pair("1001", "v2"));
+ GenerateAndAddExternalFile(options, data);
+}
+
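+// The parameter tuple is (write_global_seqno, verify_checksums_before_ingest);
+// the first instantiation below covers all four combinations.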
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
+ testing::Values(std::make_tuple(false, false),
+ std::make_tuple(false, true),
+ std::make_tuple(true, false),
+ std::make_tuple(true, true)));
+
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+ ExternSSTFileLinkFailFallbackTest,
+ testing::Values(std::make_tuple(true, false),
+ std::make_tuple(true, true),
+ std::make_tuple(false, false)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as External SST File Writer and Ingestion are not supported "
+ "in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
new file mode 100644
index 000000000..f4ca3458a
--- /dev/null
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
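+//
+// Each test below roughly follows the same cycle: write some keys (optionally
+// forcing persistence via a synced WAL write or a CompactRange()), write more
+// keys without syncing, deactivate the filesystem and drop or delete the
+// unsynced data via ResetDBState(), then reopen the DB and verify that synced
+// keys are present and that unsynced keys produce no error other than
+// NotFound.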
+
+#include "db/db_impl/db_impl.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+static const int kMaxNumValues = 2000;
+static const size_t kNumIterations = 3;
+
+enum FaultInjectionOptionConfig {
+ kDefault,
+ kDifferentDataDir,
+ kWalDir,
+ kSyncWal,
+ kWalDirSyncWal,
+ kMultiLevels,
+ kEnd,
+};
+class FaultInjectionTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::tuple<
+ bool, FaultInjectionOptionConfig, FaultInjectionOptionConfig>> {
+ protected:
+ int option_config_;
+  int non_inclusive_end_range_;  // kEnd or an equivalent end marker
+  // When data needs to be made persistent, sync the WAL
+  bool sync_use_wal_;
+  // When data needs to be made persistent, call DB::CompactRange()
+  bool sync_use_compact_;
+
+ bool sequential_order_;
+
+ public:
+ enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
+ enum ResetMethod {
+ kResetDropUnsyncedData,
+ kResetDropRandomUnsyncedData,
+ kResetDeleteUnsyncedFiles,
+ kResetDropAndDeleteUnsynced
+ };
+
+ std::unique_ptr<Env> base_env_;
+ FaultInjectionTestEnv* env_;
+ std::string dbname_;
+ std::shared_ptr<Cache> tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ FaultInjectionTest()
+ : option_config_(std::get<1>(GetParam())),
+ non_inclusive_end_range_(std::get<2>(GetParam())),
+ sync_use_wal_(false),
+ sync_use_compact_(true),
+ base_env_(nullptr),
+ env_(nullptr),
+ db_(nullptr) {}
+
+ ~FaultInjectionTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+ bool ChangeOptions() {
+ option_config_++;
+ if (option_config_ >= non_inclusive_end_range_) {
+ return false;
+ } else {
+ if (option_config_ == kMultiLevels) {
+ base_env_.reset(new MockEnv(Env::Default()));
+ }
+ return true;
+ }
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ sync_use_wal_ = false;
+ sync_use_compact_ = true;
+ Options options;
+ switch (option_config_) {
+ case kWalDir:
+ options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal");
+ break;
+ case kDifferentDataDir:
+ options.db_paths.emplace_back(
+ test::PerThreadDBPath(env_, "fault_test_data"), 1000000U);
+ break;
+ case kSyncWal:
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ case kWalDirSyncWal:
+ options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal");
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ case kMultiLevels:
+ options.write_buffer_size = 64 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 128 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ default:
+ break;
+ }
+ return options;
+ }
+
+ Status NewDB() {
+ assert(db_ == nullptr);
+ assert(tiny_cache_ == nullptr);
+ assert(env_ == nullptr);
+
+ env_ =
+ new FaultInjectionTestEnv(base_env_ ? base_env_.get() : Env::Default());
+
+ options_ = CurrentOptions();
+ options_.env = env_;
+ options_.paranoid_checks = true;
+
+ BlockBasedTableOptions table_options;
+ tiny_cache_ = NewLRUCache(100);
+ table_options.block_cache = tiny_cache_;
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ dbname_ = test::PerThreadDBPath("fault_test");
+
+ EXPECT_OK(DestroyDB(dbname_, options_));
+
+ options_.create_if_missing = true;
+ Status s = OpenDB();
+ options_.create_if_missing = false;
+ return s;
+ }
+
+ void SetUp() override {
+ sequential_order_ = std::get<0>(GetParam());
+ ASSERT_OK(NewDB());
+ }
+
+ void TearDown() override {
+ CloseDB();
+
+ Status s = DestroyDB(dbname_, options_);
+
+ delete env_;
+ env_ = nullptr;
+
+ tiny_cache_.reset();
+
+ ASSERT_OK(s);
+ }
+
+ void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = start_idx; i < start_idx + num_vals; i++) {
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ batch.Put(key, Value(i, &value_space));
+ ASSERT_OK(db_->Write(write_options, &batch));
+ }
+ }
+
+ Status ReadValue(int i, std::string* val) const {
+ std::string key_space, value_space;
+ Slice key = Key(i, &key_space);
+ Value(i, &value_space);
+ ReadOptions options;
+ return db_->Get(options, key, val);
+ }
+
+ Status Verify(int start_idx, int num_vals,
+ ExpectedVerifResult expected) const {
+ std::string val;
+ std::string value_space;
+ Status s;
+ for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+ Value(i, &value_space);
+ s = ReadValue(i, &val);
+ if (s.ok()) {
+ EXPECT_EQ(value_space, val);
+ }
+ if (expected == kValExpectFound) {
+ if (!s.ok()) {
+ fprintf(stderr, "Error when read %dth record (expect found): %s\n", i,
+ s.ToString().c_str());
+ return s;
+ }
+ } else if (!s.ok() && !s.IsNotFound()) {
+ fprintf(stderr, "Error when read %dth record: %s\n", i,
+ s.ToString().c_str());
+ return s;
+ }
+ }
+ return Status::OK();
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) const {
+ unsigned long long num = i;
+ if (!sequential_order_) {
+      // Apply a pseudo-random transform so keys are not in sequential order
+ const int m = 0x5bd1e995;
+ num *= m;
+ num ^= num << 24;
+ }
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", static_cast<int>(num));
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) const {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+
+ void CloseDB() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status OpenDB() {
+ CloseDB();
+ env_->ResetState();
+ Status s = DB::Open(options_, dbname_, &db_);
+ assert(db_ != nullptr);
+ return s;
+ }
+
+ void DeleteAllData() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ WriteOptions options;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+ }
+
+ delete iter;
+
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ db_->Flush(flush_options);
+ }
+
+ // rnd cannot be null for kResetDropRandomUnsyncedData
+ void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
+ env_->AssertNoOpenFile();
+ switch (reset_method) {
+ case kResetDropUnsyncedData:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ break;
+ case kResetDropRandomUnsyncedData:
+ ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
+ break;
+ case kResetDeleteUnsyncedFiles:
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ case kResetDropAndDeleteUnsynced:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+ DeleteAllData();
+
+ WriteOptions write_options;
+ write_options.sync = sync_use_wal_;
+
+ Build(write_options, 0, num_pre_sync);
+ if (sync_use_compact_) {
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+ write_options.sync = false;
+ Build(write_options, num_pre_sync, num_post_sync);
+ }
+
+ void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+ int num_pre_sync, int num_post_sync,
+ Random* rnd = nullptr) {
+ env_->SetFilesystemActive(false);
+ CloseDB();
+ ResetDBState(reset_method, rnd);
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::kValExpectNoError));
+ WaitCompactionFinish();
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::kValExpectNoError));
+ }
+
+  void NoWriteTestPreFault() {}
+
+ void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+ CloseDB();
+ ResetDBState(reset_method);
+ ASSERT_OK(OpenDB());
+ }
+
+ void WaitCompactionFinish() {
+ static_cast<DBImpl*>(db_->GetRootDB())->TEST_WaitForCompact();
+ ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+ }
+};
+
+class FaultInjectionTestSplitted : public FaultInjectionTest {};
+
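+// For each option configuration in [option_config_, non_inclusive_end_range_),
+// run kNumIterations rounds of write / simulated crash / reopen / verify,
+// exercising each of the ResetMethod variants.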
+TEST_P(FaultInjectionTestSplitted, FaultTest) {
+ do {
+ Random rnd(301);
+
+ for (size_t idx = 0; idx < kNumIterations; idx++) {
+ int num_pre_sync = rnd.Uniform(kMaxNumValues);
+ int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync,
+ num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData,
+ num_pre_sync, num_post_sync, &rnd);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      // Setting a separate data path won't pass the test, as we don't sync
+      // it after creating new files.
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced,
+ num_pre_sync, num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      // No new files are created, so no files will be dropped and we expect
+      // to find all values.
+ PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync,
+ num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles);
+ }
+ } while (ChangeOptions());
+}
+
+// Previous log file is not fsynced if sync is forced after log rolling.
+TEST_P(FaultInjectionTest, WriteOptionSyncTest) {
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ // Block the job queue to prevent flush job from running.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::HIGH);
+ sleeping_task_low.WaitUntilSleeping();
+
+ WriteOptions write_options;
+ write_options.sync = false;
+
+ std::string key_space, value_space;
+ ASSERT_OK(
+ db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(db_->Flush(flush_options));
+ write_options.sync = true;
+ ASSERT_OK(
+ db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+ db_->FlushWAL(false);
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(OpenDB());
+ std::string val;
+ Value(2, &value_space);
+ ASSERT_OK(ReadValue(2, &val));
+ ASSERT_EQ(value_space, val);
+
+ Value(1, &value_space);
+ ASSERT_OK(ReadValue(1, &val));
+ ASSERT_EQ(value_space, val);
+}
+
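+// Simulate a crash after a compaction has finished running but before its
+// result has been installed and persisted, then verify that reopening the DB
+// recovers every key and that no background compaction is scheduled before
+// DB::Open() has completed.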
+TEST_P(FaultInjectionTest, UninstalledCompaction) {
+ options_.target_file_size_base = 32 * 1024;
+ options_.write_buffer_size = 100 << 10; // 100KB
+ options_.level0_file_num_compaction_trigger = 6;
+ options_.level0_stop_writes_trigger = 1 << 10;
+ options_.level0_slowdown_writes_trigger = 1 << 10;
+ options_.max_background_compactions = 1;
+ OpenDB();
+
+ if (!sequential_order_) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"},
+ {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"},
+ {"FaultInjectionTest::FaultTest:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ int kNumKeys = 1000;
+ Build(WriteOptions(), 0, kNumKeys);
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ db_->Flush(flush_options);
+ ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0");
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1");
+ env_->SetFilesystemActive(false);
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2");
+ CloseDB();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ResetDBState(kResetDropUnsyncedData);
+
+ std::atomic<bool> opened(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::Open:Opened", [&](void* /*arg*/) { opened.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction",
+ [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+ WaitCompactionFinish();
+ ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
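+// Same scenario as WriteOptionSyncTest above, except that the WAL is
+// persisted explicitly with FlushWAL(true) instead of relying on a synced
+// write.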
+TEST_P(FaultInjectionTest, ManualLogSyncTest) {
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ // Block the job queue to prevent flush job from running.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::HIGH);
+ sleeping_task_low.WaitUntilSleeping();
+
+ WriteOptions write_options;
+ write_options.sync = false;
+
+ std::string key_space, value_space;
+ ASSERT_OK(
+ db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(db_->Flush(flush_options));
+ ASSERT_OK(
+ db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+ ASSERT_OK(db_->FlushWAL(true));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(OpenDB());
+ std::string val;
+ Value(2, &value_space);
+ ASSERT_OK(ReadValue(2, &val));
+ ASSERT_EQ(value_space, val);
+
+ Value(1, &value_space);
+ ASSERT_OK(ReadValue(1, &val));
+ ASSERT_EQ(value_space, val);
+}
+
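+// Entries added to a WriteBatch after MarkWalTerminationPoint() are applied
+// to the memtable but not written to the WAL, so they must not survive a
+// crash that drops unsynced data.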
+TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) {
+ ReadOptions ro;
+ Options options = CurrentOptions();
+ options.env = env_;
+
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+ WriteBatch batch;
+ batch.Put("cats", "dogs");
+ batch.MarkWalTerminationPoint();
+ batch.Put("boys", "girls");
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ ASSERT_OK(OpenDB());
+
+ std::string val;
+ ASSERT_OK(db_->Get(ro, "cats", &val));
+ ASSERT_EQ("dogs", val);
+ ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound());
+}
+
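+// The parameter tuple is (sequential_order, first option config,
+// non-inclusive end config). The "Splitted" fixture below runs the FaultTest
+// option sweep over two separate config ranges instead of one.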
+INSTANTIATE_TEST_CASE_P(
+ FaultTest, FaultInjectionTest,
+ ::testing::Values(std::make_tuple(false, kDefault, kEnd),
+ std::make_tuple(true, kDefault, kEnd)));
+
+INSTANTIATE_TEST_CASE_P(
+ FaultTest, FaultInjectionTestSplitted,
+ ::testing::Values(std::make_tuple(false, kDefault, kSyncWal),
+ std::make_tuple(true, kDefault, kSyncWal),
+ std::make_tuple(false, kSyncWal, kEnd),
+ std::make_tuple(true, kSyncWal, kEnd)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/file_indexer.cc b/src/rocksdb/db/file_indexer.cc
new file mode 100644
index 000000000..523cb3c16
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.cc
@@ -0,0 +1,216 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+#include <algorithm>
+#include <functional>
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FileIndexer::FileIndexer(const Comparator* ucmp)
+ : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {}
+
+size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); }
+
+size_t FileIndexer::LevelIndexSize(size_t level) const {
+ if (level >= next_level_index_.size()) {
+ return 0;
+ }
+ return next_level_index_[level].num_index;
+}
+
+void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index,
+ const int cmp_smallest,
+ const int cmp_largest, int32_t* left_bound,
+ int32_t* right_bound) const {
+ assert(level > 0);
+
+ // Last level, no hint
+ if (level == num_levels_ - 1) {
+ *left_bound = 0;
+ *right_bound = -1;
+ return;
+ }
+
+ assert(level < num_levels_ - 1);
+ assert(static_cast<int32_t>(file_index) <= level_rb_[level]);
+
+ const IndexUnit* index_units = next_level_index_[level].index_units;
+ const auto& index = index_units[file_index];
+
+ if (cmp_smallest < 0) {
+ *left_bound = (level > 0 && file_index > 0)
+ ? index_units[file_index - 1].largest_lb
+ : 0;
+ *right_bound = index.smallest_rb;
+ } else if (cmp_smallest == 0) {
+ *left_bound = index.smallest_lb;
+ *right_bound = index.smallest_rb;
+ } else if (cmp_smallest > 0 && cmp_largest < 0) {
+ *left_bound = index.smallest_lb;
+ *right_bound = index.largest_rb;
+ } else if (cmp_largest == 0) {
+ *left_bound = index.largest_lb;
+ *right_bound = index.largest_rb;
+ } else if (cmp_largest > 0) {
+ *left_bound = index.largest_lb;
+ *right_bound = level_rb_[level + 1];
+ } else {
+ assert(false);
+ }
+
+ assert(*left_bound >= 0);
+ assert(*left_bound <= *right_bound + 1);
+ assert(*right_bound <= level_rb_[level + 1]);
+}
+
+void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels,
+ std::vector<FileMetaData*>* const files) {
+ if (files == nullptr) {
+ return;
+ }
+  if (num_levels == 0) {  // avoid size_t underflow in (num_levels - 1) below
+ num_levels_ = num_levels;
+ return;
+ }
+  assert(level_rb_ == nullptr);  // level_rb_ should be initialized here
+
+ num_levels_ = num_levels;
+ next_level_index_.resize(num_levels);
+
+ char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t));
+ level_rb_ = new (mem) int32_t[num_levels_];
+ for (size_t i = 0; i < num_levels_; i++) {
+ level_rb_[i] = -1;
+ }
+
+ // L1 - Ln-1
+ for (size_t level = 1; level < num_levels_ - 1; ++level) {
+ const auto& upper_files = files[level];
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const auto& lower_files = files[level + 1];
+ level_rb_[level] = static_cast<int32_t>(upper_files.size()) - 1;
+ if (upper_size == 0) {
+ continue;
+ }
+ IndexLevel& index_level = next_level_index_[level];
+ index_level.num_index = upper_size;
+ mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
+ index_level.index_units = new (mem) IndexUnit[upper_size];
+
+ CalculateLB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+ b->largest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; });
+ CalculateLB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+ b->largest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; });
+ CalculateRB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+ b->smallest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; });
+ CalculateRB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+ b->smallest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
+ }
+
+ level_rb_[num_levels_ - 1] =
+ static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
+}
+
+void FileIndexer::CalculateLB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index) {
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+ int32_t upper_idx = 0;
+ int32_t lower_idx = 0;
+
+ IndexUnit* index = index_level->index_units;
+ while (upper_idx < upper_size && lower_idx < lower_size) {
+ int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+ if (cmp == 0) {
+ set_index(&index[upper_idx], lower_idx);
+ ++upper_idx;
+ } else if (cmp > 0) {
+      // The lower level file's largest key is smaller, so a key won't hit
+      // in that file. Move to the next lower file.
+ ++lower_idx;
+ } else {
+      // The lower level's file becomes larger; update the index and
+      // move to the next upper file.
+ set_index(&index[upper_idx], lower_idx);
+ ++upper_idx;
+ }
+ }
+
+ while (upper_idx < upper_size) {
+    // Lower files are exhausted, which means the remaining upper files are
+    // greater than any lower file. Set the index to the lower level size.
+ set_index(&index[upper_idx], lower_size);
+ ++upper_idx;
+ }
+}
+
+void FileIndexer::CalculateRB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index) {
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+ int32_t upper_idx = upper_size - 1;
+ int32_t lower_idx = lower_size - 1;
+
+ IndexUnit* index = index_level->index_units;
+ while (upper_idx >= 0 && lower_idx >= 0) {
+ int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+ if (cmp == 0) {
+ set_index(&index[upper_idx], lower_idx);
+ --upper_idx;
+ } else if (cmp < 0) {
+      // The lower level file's smallest key is larger, so a key won't hit
+      // in that file. Move to the next lower file.
+ --lower_idx;
+ } else {
+      // The lower level's file becomes smaller; update the index and move
+      // to the next upper file.
+ set_index(&index[upper_idx], lower_idx);
+ --upper_idx;
+ }
+ }
+ while (upper_idx >= 0) {
+    // Lower files are exhausted, which means the remaining upper files are
+    // smaller than any lower file. Set the index to -1.
+ set_index(&index[upper_idx], -1);
+ --upper_idx;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h
new file mode 100644
index 000000000..ad7553f2c
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+#include "memory/arena.h"
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+struct FileMetaData;
+struct FdWithKeyRange;
+struct FileLevel;
+
+// The file tree structure in Version is prebuilt and the range of each file
+// is known. On Version::Get(), it uses binary search to find a potential file
+// and then check if a target key can be found in the file by comparing the key
+// to each file's smallest and largest key. The results of these comparisons
+// can be reused beyond checking if a key falls into a file's range.
+// With some pre-calculated knowledge, each key comparison that has been done
+// can serve as a hint to narrow down further searches: if a key compares
+// smaller than a file's smallest or largest key, that comparison can be used
+// to determine the right bound of the next binary search. Similarly, if a key
+// compares larger than a file's smallest or largest key, it can be used to
+// determine the left bound of the next binary search.
+// With these hints: it can greatly reduce the range of binary search,
+// especially for bottom levels, given that one file most likely overlaps with
+// only N files from level below (where N is max_bytes_for_level_multiplier).
+// So on level L, we will only look at ~N files instead of N^L files on the
+// naive approach.
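+//
+// Rough usage sketch (illustrative; the comment above points to the
+// Version::Get() file lookup as the consumer): after comparing a key against
+// a file on level L and obtaining cmp_smallest / cmp_largest from the user
+// comparator, pass both results to GetNextLevelIndex() to get the
+// [left_bound, right_bound] range of candidate files on level L + 1, instead
+// of binary-searching the whole level.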
+class FileIndexer {
+ public:
+ explicit FileIndexer(const Comparator* ucmp);
+
+ size_t NumLevelIndex() const;
+
+ size_t LevelIndexSize(size_t level) const;
+
+  // Return a file index range in the next level to search for a key, based
+  // on the smallest- and largest-key comparisons for the current file
+  // specified by level and file_index. When *left_bound <= *right_bound,
+  // both bounds are valid indices into the next level's file vector.
+ void GetNextLevelIndex(const size_t level, const size_t file_index,
+ const int cmp_smallest, const int cmp_largest,
+ int32_t* left_bound, int32_t* right_bound) const;
+
+ void UpdateIndex(Arena* arena, const size_t num_levels,
+ std::vector<FileMetaData*>* const files);
+
+ enum {
+ // MSVC version 1800 still does not have constexpr for ::max()
+ kLevelMaxIndex = ROCKSDB_NAMESPACE::port::kMaxInt32
+ };
+
+ private:
+ size_t num_levels_;
+ const Comparator* ucmp_;
+
+ struct IndexUnit {
+ IndexUnit()
+ : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
+ // During file search, a key is compared against smallest and largest
+ // from a FileMetaData. It can have 3 possible outcomes:
+    // (1) key is smaller than smallest, implying it is also smaller than
+    //     largest. Precalculated index based on "smallest < smallest" can
+    //     be used to provide right bound.
+    // (2) key is in between smallest and largest.
+    //     Precalculated index based on "smallest > largest" can be used to
+ // provide left bound.
+ // Precalculated index based on "largest < smallest" can be used to
+ // provide right bound.
+ // (3) key is larger than largest, implying it is also larger than smallest.
+ // Precalculated index based on "largest > largest" can be used to
+ // provide left bound.
+ //
+ // As a result, we will need to do:
+ // Compare smallest (<=) and largest keys from upper level file with
+ // smallest key from lower level to get a right bound.
+ // Compare smallest (>=) and largest keys from upper level file with
+ // largest key from lower level to get a left bound.
+ //
+ // Example:
+ // level 1: [50 - 60]
+ // level 2: [1 - 40], [45 - 55], [58 - 80]
+    // A key 35 compares less than 50, so the 3rd file on level 2 can be
+    // skipped according to rule (1). LB = 0, RB = 1.
+    // A key 53 sits in between 50 and 60. The 1st file on level 2 can be
+ // skipped according to rule (2)-a, but the 3rd file cannot be skipped
+ // because 60 is greater than 58. LB = 1, RB = 2.
+    // A key 70 compares larger than 60, so the 1st and 2nd files can be
+    // skipped according to rule (3). LB = 2, RB = 2.
+ //
+ // Point to a left most file in a lower level that may contain a key,
+ // which compares greater than smallest of a FileMetaData (upper level)
+ int32_t smallest_lb;
+ // Point to a left most file in a lower level that may contain a key,
+ // which compares greater than largest of a FileMetaData (upper level)
+ int32_t largest_lb;
+ // Point to a right most file in a lower level that may contain a key,
+ // which compares smaller than smallest of a FileMetaData (upper level)
+ int32_t smallest_rb;
+ // Point to a right most file in a lower level that may contain a key,
+ // which compares smaller than largest of a FileMetaData (upper level)
+ int32_t largest_rb;
+ };
+
+ // Data structure to store IndexUnits in a whole level
+ struct IndexLevel {
+ size_t num_index;
+ IndexUnit* index_units;
+
+ IndexLevel() : num_index(0), index_units(nullptr) {}
+ };
+
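+  // Both helpers perform a merge-style sweep over an upper level and the
+  // level below it: cmp_op compares an upper file with a lower file, and
+  // set_index stores the resulting bound (left bound for CalculateLB, right
+  // bound for CalculateRB) into the corresponding upper file's IndexUnit.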
+ void CalculateLB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index);
+
+ void CalculateRB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index);
+
+ autovector<IndexLevel> next_level_index_;
+ int32_t* level_rb_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer_test.cc b/src/rocksdb/db/file_indexer_test.cc
new file mode 100644
index 000000000..99ce93993
--- /dev/null
+++ b/src/rocksdb/db/file_indexer_test.cc
@@ -0,0 +1,350 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+#include <string>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IntComparator : public Comparator {
+ public:
+ int Compare(const Slice& a, const Slice& b) const override {
+ assert(a.size() == 8);
+ assert(b.size() == 8);
+ int64_t diff = *reinterpret_cast<const int64_t*>(a.data()) -
+ *reinterpret_cast<const int64_t*>(b.data());
+ if (diff < 0) {
+ return -1;
+ } else if (diff == 0) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+
+ const char* Name() const override { return "IntComparator"; }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class FileIndexerTest : public testing::Test {
+ public:
+ FileIndexerTest()
+ : kNumLevels(4), files(new std::vector<FileMetaData*>[kNumLevels]) {}
+
+ ~FileIndexerTest() override {
+ ClearFiles();
+ delete[] files;
+ }
+
+ void AddFile(int level, int64_t smallest, int64_t largest) {
+ auto* f = new FileMetaData();
+ f->smallest = IntKey(smallest);
+ f->largest = IntKey(largest);
+ files[level].push_back(f);
+ }
+
+ InternalKey IntKey(int64_t v) {
+ return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
+ }
+
+ void ClearFiles() {
+ for (uint32_t i = 0; i < kNumLevels; ++i) {
+ for (auto* f : files[i]) {
+ delete f;
+ }
+ files[i].clear();
+ }
+ }
+
+ void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
+ const int cmp_smallest, const int cmp_largest, int32_t* left_index,
+ int32_t* right_index) {
+ *left_index = 100;
+ *right_index = 100;
+ indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+ left_index, right_index);
+ }
+
+ int32_t left = 100;
+ int32_t right = 100;
+ const uint32_t kNumLevels;
+ IntComparator ucmp;
+ FileIndexer* indexer;
+
+ std::vector<FileMetaData*>* files;
+};
+
+// Case 0: Empty
+TEST_F(FileIndexerTest, Empty) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ indexer->UpdateIndex(&arena, 0, files);
+ delete indexer;
+}
+
+// Case 1: no overlap, files are on the left of next level files
+TEST_F(FileIndexerTest, no_overlap_left) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 100, 200);
+ AddFile(1, 300, 400);
+ AddFile(1, 500, 600);
+ // level 2
+ AddFile(2, 1500, 1600);
+ AddFile(2, 1601, 1699);
+ AddFile(2, 1700, 1800);
+ // level 3
+ AddFile(3, 2500, 2600);
+ AddFile(3, 2601, 2699);
+ AddFile(3, 2700, 2800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t level = 1; level < 3; ++level) {
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(level, f, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, 0, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, 1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(2, right);
+ }
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+// Case 2: no overlap, files are on the right of next level files
+TEST_F(FileIndexerTest, no_overlap_right) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 2100, 2200);
+ AddFile(1, 2300, 2400);
+ AddFile(1, 2500, 2600);
+ // level 2
+ AddFile(2, 1500, 1600);
+ AddFile(2, 1501, 1699);
+ AddFile(2, 1700, 1800);
+ // level 3
+ AddFile(3, 500, 600);
+ AddFile(3, 501, 699);
+ AddFile(3, 700, 800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t level = 1; level < 3; ++level) {
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(level, f, -1, -1, &left, &right);
+ ASSERT_EQ(f == 0 ? 0 : 3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 0, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, 0, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, 1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ }
+ }
+ delete indexer;
+}
+
+// Case 3: empty L2
+TEST_F(FileIndexerTest, empty_L2) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ for (uint32_t i = 1; i < kNumLevels; ++i) {
+ ASSERT_EQ(0U, indexer->LevelIndexSize(i));
+ }
+ // level 1
+ AddFile(1, 2100, 2200);
+ AddFile(1, 2300, 2400);
+ AddFile(1, 2500, 2600);
+ // level 3
+ AddFile(3, 500, 600);
+ AddFile(3, 501, 699);
+ AddFile(3, 700, 800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(1, f, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, 0, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, 1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+// Case 4: mixed
+TEST_F(FileIndexerTest, mixed) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 100, 200);
+ AddFile(1, 250, 400);
+ AddFile(1, 450, 500);
+ // level 2
+ AddFile(2, 100, 150); // 0
+ AddFile(2, 200, 250); // 1
+ AddFile(2, 251, 300); // 2
+ AddFile(2, 301, 350); // 3
+ AddFile(2, 500, 600); // 4
+ // level 3
+ AddFile(3, 0, 50);
+ AddFile(3, 100, 200);
+ AddFile(3, 201, 250);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ // level 1, 0
+ GetNextLevelIndex(1, 0, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(0, right);
+ GetNextLevelIndex(1, 0, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(0, right);
+ GetNextLevelIndex(1, 0, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 0, 1, 0, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 0, 1, 1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(4, right);
+ // level 1, 1
+ GetNextLevelIndex(1, 1, -1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 1, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 1, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 1, 1, 0, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 1, 1, 1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ // level 1, 2
+ GetNextLevelIndex(1, 2, -1, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 2, 0, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 2, 1, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ GetNextLevelIndex(1, 2, 1, 0, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ GetNextLevelIndex(1, 2, 1, 1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ // level 2, 0
+ GetNextLevelIndex(2, 0, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, 0, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, 1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(2, right);
+ // level 2, 1
+ GetNextLevelIndex(2, 1, -1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 1, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 1, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, 1, 1, 0, &left, &right);
+ ASSERT_EQ(2, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, 1, 1, 1, &left, &right);
+ ASSERT_EQ(2, left);
+ ASSERT_EQ(2, right);
+ // level 2, [2 - 4], no overlap
+ for (uint32_t f = 2; f <= 4; ++f) {
+ GetNextLevelIndex(2, f, -1, -1, &left, &right);
+ ASSERT_EQ(f == 2 ? 2 : 3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 0, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, 0, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, 1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/filename_test.cc b/src/rocksdb/db/filename_test.cc
new file mode 100644
index 000000000..9a04542f6
--- /dev/null
+++ b/src/rocksdb/db/filename_test.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/filename.h"
+
+#include "db/dbformat.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileNameTest : public testing::Test {};
+
+TEST_F(FileNameTest, Parse) {
+ Slice db;
+ FileType type;
+ uint64_t number;
+
+ char kDefautInfoLogDir = 1;
+ char kDifferentInfoLogDir = 2;
+ char kNoCheckLogDir = 4;
+ char kAllMode = kDefautInfoLogDir | kDifferentInfoLogDir | kNoCheckLogDir;
+
+ // Successful parses
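+ // Each case carries a bitmask of the parsing modes under which it is
+ // expected to parse successfully; kAllMode means every mode accepts it.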
+ static struct {
+ const char* fname;
+ uint64_t number;
+ FileType type;
+ char mode;
+ } cases[] = {
+ {"100.log", 100, kLogFile, kAllMode},
+ {"0.log", 0, kLogFile, kAllMode},
+ {"0.sst", 0, kTableFile, kAllMode},
+ {"CURRENT", 0, kCurrentFile, kAllMode},
+ {"LOCK", 0, kDBLockFile, kAllMode},
+ {"MANIFEST-2", 2, kDescriptorFile, kAllMode},
+ {"MANIFEST-7", 7, kDescriptorFile, kAllMode},
+ {"METADB-2", 2, kMetaDatabase, kAllMode},
+ {"METADB-7", 7, kMetaDatabase, kAllMode},
+ {"LOG", 0, kInfoLogFile, kDefautInfoLogDir},
+ {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir},
+ {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir},
+ {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir},
+ {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir},
+ {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir},
+ {"18446744073709551615.log", 18446744073709551615ull, kLogFile,
+ kAllMode},
+ };
+ for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) {
+ for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+ InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir");
+ if (cases[i].mode & mode) {
+ std::string f = cases[i].fname;
+ if (mode == kNoCheckLogDir) {
+ ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+ } else {
+ ASSERT_TRUE(ParseFileName(f, &number, info_log_prefix.prefix, &type))
+ << f;
+ }
+ ASSERT_EQ(cases[i].type, type) << f;
+ ASSERT_EQ(cases[i].number, number) << f;
+ }
+ }
+ }
+
+ // Errors
+ static const char* errors[] = {
+ "",
+ "foo",
+ "foo-dx-100.log",
+ ".log",
+ "",
+ "manifest",
+ "CURREN",
+ "CURRENTX",
+ "MANIFES",
+ "MANIFEST",
+ "MANIFEST-",
+ "XMANIFEST-3",
+ "MANIFEST-3x",
+ "META",
+ "METADB",
+ "METADB-",
+ "XMETADB-3",
+ "METADB-3x",
+ "LOC",
+ "LOCKx",
+ "LO",
+ "LOGx",
+ "18446744073709551616.log",
+ "184467440737095516150.log",
+ "100",
+ "100.",
+ "100.lop"
+ };
+ for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+ std::string f = errors[i];
+ ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+ }
+}
+
+TEST_F(FileNameTest, InfoLogFileName) {
+ std::string dbname = ("/data/rocksdb");
+ std::string db_absolute_path;
+ Env::Default()->GetAbsolutePath(dbname, &db_absolute_path);
+
+ ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, ""));
+ ASSERT_EQ("/data/rocksdb/LOG.old.666",
+ OldInfoLogFileName(dbname, 666u, db_absolute_path, ""));
+
+ ASSERT_EQ("/data/rocksdb_log/data_rocksdb_LOG",
+ InfoLogFileName(dbname, db_absolute_path, "/data/rocksdb_log"));
+ ASSERT_EQ(
+ "/data/rocksdb_log/data_rocksdb_LOG.old.666",
+ OldInfoLogFileName(dbname, 666u, db_absolute_path, "/data/rocksdb_log"));
+}
+
+TEST_F(FileNameTest, Construction) {
+ uint64_t number;
+ FileType type;
+ std::string fname;
+
+ fname = CurrentFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0U, number);
+ ASSERT_EQ(kCurrentFile, type);
+
+ fname = LockFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0U, number);
+ ASSERT_EQ(kDBLockFile, type);
+
+ fname = LogFileName("foo", 192);
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(192U, number);
+ ASSERT_EQ(kLogFile, type);
+
+ fname = TableFileName({DbPath("bar", 0)}, 200, 0);
+ std::string fname1 =
+ TableFileName({DbPath("foo", 0), DbPath("bar", 0)}, 200, 1);
+ ASSERT_EQ(fname, fname1);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(200U, number);
+ ASSERT_EQ(kTableFile, type);
+
+ fname = DescriptorFileName("bar", 100);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100U, number);
+ ASSERT_EQ(kDescriptorFile, type);
+
+ fname = TempFileName("tmp", 999);
+ ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(999U, number);
+ ASSERT_EQ(kTempFile, type);
+
+ fname = MetaDatabaseName("met", 100);
+ ASSERT_EQ("met/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100U, number);
+ ASSERT_EQ(kMetaDatabase, type);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc
new file mode 100644
index 000000000..997bd8080
--- /dev/null
+++ b/src/rocksdb/db/flush_job.cc
@@ -0,0 +1,466 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/flush_job.h"
+
+#include <cinttypes>
+
+#include <algorithm>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/event_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetFlushReasonString(FlushReason flush_reason) {
+ switch (flush_reason) {
+ case FlushReason::kOthers:
+ return "Other Reasons";
+ case FlushReason::kGetLiveFiles:
+ return "Get Live Files";
+ case FlushReason::kShutDown:
+ return "Shut down";
+ case FlushReason::kExternalFileIngestion:
+ return "External File Ingestion";
+ case FlushReason::kManualCompaction:
+ return "Manual Compaction";
+ case FlushReason::kWriteBufferManager:
+ return "Write Buffer Manager";
+ case FlushReason::kWriteBufferFull:
+ return "Write Buffer Full";
+ case FlushReason::kTest:
+ return "Test";
+ case FlushReason::kDeleteFiles:
+ return "Delete Files";
+ case FlushReason::kAutoCompaction:
+ return "Auto Compaction";
+ case FlushReason::kManualFlush:
+ return "Manual Flush";
+ case FlushReason::kErrorRecovery:
+ return "Error Recovery";
+ default:
+ return "Invalid";
+ }
+}
+
+FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const MutableCFOptions& mutable_cf_options,
+ const uint64_t* max_memtable_id,
+ const FileOptions& file_options, VersionSet* versions,
+ InstrumentedMutex* db_mutex,
+ std::atomic<bool>* shutting_down,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, JobContext* job_context,
+ LogBuffer* log_buffer, Directory* db_directory,
+ Directory* output_file_directory,
+ CompressionType output_compression, Statistics* stats,
+ EventLogger* event_logger, bool measure_io_stats,
+ const bool sync_output_directory, const bool write_manifest,
+ Env::Priority thread_pri)
+ : dbname_(dbname),
+ cfd_(cfd),
+ db_options_(db_options),
+ mutable_cf_options_(mutable_cf_options),
+ max_memtable_id_(max_memtable_id),
+ file_options_(file_options),
+ versions_(versions),
+ db_mutex_(db_mutex),
+ shutting_down_(shutting_down),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ job_context_(job_context),
+ log_buffer_(log_buffer),
+ db_directory_(db_directory),
+ output_file_directory_(output_file_directory),
+ output_compression_(output_compression),
+ stats_(stats),
+ event_logger_(event_logger),
+ measure_io_stats_(measure_io_stats),
+ sync_output_directory_(sync_output_directory),
+ write_manifest_(write_manifest),
+ edit_(nullptr),
+ base_(nullptr),
+ pick_memtable_called(false),
+ thread_pri_(thread_pri) {
+ // Update the thread status to indicate flush.
+ ReportStartedFlush();
+ TEST_SYNC_POINT("FlushJob::FlushJob()");
+}
+
+FlushJob::~FlushJob() {
+ ThreadStatusUtil::ResetThreadStatus();
+}
+
+void FlushJob::ReportStartedFlush() {
+ ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_JOB_ID,
+ job_context_->job_id);
+ IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
+ uint64_t input_size = 0;
+ for (auto* mem : mems) {
+ input_size += mem->ApproximateMemoryUsage();
+ }
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_MEMTABLES,
+ input_size);
+}
+
+void FlushJob::RecordFlushIOStats() {
+ RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::PickMemTable() {
+ db_mutex_->AssertHeld();
+ assert(!pick_memtable_called);
+ pick_memtable_called = true;
+ // Save the contents of the earliest memtable as a new Table
+ cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_);
+ if (mems_.empty()) {
+ return;
+ }
+
+ ReportFlushInputSize(mems_);
+
+ // The memtables in `mems_` are (implicitly) sorted in ascending order by
+ // their creation time. We will use the first memtable's `edit` to keep the
+ // meta info for this flush.
+ MemTable* m = mems_[0];
+ edit_ = m->GetEdits();
+ edit_->SetPrevLogNumber(0);
+ // SetLogNumber(log_num) indicates logs with number smaller than log_num
+ // will no longer be picked up for recovery.
+ edit_->SetLogNumber(mems_.back()->GetNextLogNumber());
+ edit_->SetColumnFamily(cfd_->GetID());
+
+ // path 0 for level 0 file.
+ meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+
+ base_ = cfd_->current();
+ base_->Ref(); // it is likely that we do not need this reference
+}
+
+Status FlushJob::Run(LogsWithPrepTracker* prep_tracker,
+ FileMetaData* file_meta) {
+ TEST_SYNC_POINT("FlushJob::Start");
+ db_mutex_->AssertHeld();
+ assert(pick_memtable_called);
+ AutoThreadOperationStageUpdater stage_run(
+ ThreadStatus::STAGE_FLUSH_RUN);
+ if (mems_.empty()) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
+ cfd_->GetName().c_str());
+ return Status::OK();
+ }
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTime);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+
+ // This will release and re-acquire the mutex.
+ Status s = WriteLevel0Table();
+
+ if (s.ok() && cfd_->IsDropped()) {
+ s = Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((s.ok() || s.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_acquire)) {
+ s = Status::ShutdownInProgress("Database shutdown");
+ }
+
+ if (!s.ok()) {
+ cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
+ } else if (write_manifest_) {
+ TEST_SYNC_POINT("FlushJob::InstallResults");
+ // Replace immutable memtable with the generated Table
+ s = cfd_->imm()->TryInstallMemtableFlushResults(
+ cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
+ meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
+ log_buffer_, &committed_flush_jobs_info_);
+ }
+
+ if (s.ok() && file_meta != nullptr) {
+ *file_meta = meta_;
+ }
+ RecordFlushIOStats();
+
+ // When measure_io_stats_ is true, the default 512 bytes is not enough.
+ auto stream = event_logger_->LogToBuffer(log_buffer_, 1024);
+ stream << "job" << job_context_->job_id << "event"
+ << "flush_finished";
+ stream << "output_compression"
+ << CompressionTypeToString(output_compression_);
+ stream << "lsm_state";
+ stream.StartArray();
+ auto vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+ stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
+
+ if (measure_io_stats_) {
+ if (prev_perf_level != PerfLevel::kEnableTime) {
+ SetPerfLevel(prev_perf_level);
+ }
+ stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
+ stream << "file_range_sync_nanos"
+ << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
+ stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
+ stream << "file_prepare_write_nanos"
+ << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
+ stream << "file_cpu_write_nanos"
+ << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos);
+ stream << "file_cpu_read_nanos"
+ << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
+ }
+
+ return s;
+}
+
+void FlushJob::Cancel() {
+ db_mutex_->AssertHeld();
+ assert(base_ != nullptr);
+ base_->Unref();
+}
+
+Status FlushJob::WriteLevel0Table() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_FLUSH_WRITE_L0);
+ db_mutex_->AssertHeld();
+ const uint64_t start_micros = db_options_.env->NowMicros();
+ const uint64_t start_cpu_micros = db_options_.env->NowCPUNanos() / 1000;
+ Status s;
+ {
+ auto write_hint = cfd_->CalculateSSTWriteHint(0);
+ db_mutex_->Unlock();
+ if (log_buffer_) {
+ log_buffer_->FlushBufferToLog();
+ }
+ // memtables and range_del_iters store internal iterators over each data
+ // memtable and its associated range deletion memtable, respectively, at
+ // corresponding indexes.
+ std::vector<InternalIterator*> memtables;
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ uint64_t total_num_entries = 0, total_num_deletes = 0;
+ uint64_t total_data_size = 0;
+ size_t total_memory_usage = 0;
+ for (MemTable* m : mems_) {
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
+ cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
+ memtables.push_back(m->NewIterator(ro, &arena));
+ auto* range_del_iter =
+ m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ total_num_entries += m->num_entries();
+ total_num_deletes += m->num_deletes();
+ total_data_size += m->get_data_size();
+ total_memory_usage += m->ApproximateMemoryUsage();
+ }
+
+ event_logger_->Log() << "job" << job_context_->job_id << "event"
+ << "flush_started"
+ << "num_memtables" << mems_.size() << "num_entries"
+ << total_num_entries << "num_deletes"
+ << total_num_deletes << "total_data_size"
+ << total_data_size << "memory_usage"
+ << total_memory_usage << "flush_reason"
+ << GetFlushReasonString(cfd_->GetFlushReason());
+
+ {
+ ScopedArenaIterator iter(
+ NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
+ static_cast<int>(memtables.size()), &arena));
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ meta_.fd.GetNumber());
+
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
+ &output_compression_);
+ int64_t _current_time = 0;
+ auto status = db_options_.env->GetCurrentTime(&_current_time);
+ // It is safe to proceed even if GetCurrentTime fails, so just log a
+ // warning and continue.
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Failed to get current time to populate creation_time property. "
+ "Status: %s",
+ status.ToString().c_str());
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ uint64_t oldest_key_time =
+ mems_.front()->ApproximateOldestKeyTime();
+
+ // It's not clear whether oldest_key_time is always available. In case
+ // it is not available, use current_time.
+ meta_.oldest_ancester_time = std::min(current_time, oldest_key_time);
+ meta_.file_creation_time = current_time;
+
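+ // For tables created by flush, FIFO compaction uses the wall-clock
+ // creation time as the table's creation_time property; other compaction
+ // styles use the oldest ancestor time computed above.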
+ uint64_t creation_time = (cfd_->ioptions()->compaction_style ==
+ CompactionStyle::kCompactionStyleFIFO)
+ ? current_time
+ : meta_.oldest_ancester_time;
+
+ s = BuildTable(
+ dbname_, db_options_.env, db_options_.fs.get(), *cfd_->ioptions(),
+ mutable_cf_options_, file_options_, cfd_->table_cache(), iter.get(),
+ std::move(range_del_iters), &meta_, cfd_->internal_comparator(),
+ cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(),
+ cfd_->GetName(), existing_snapshots_,
+ earliest_write_conflict_snapshot_, snapshot_checker_,
+ output_compression_, mutable_cf_options_.sample_for_compression,
+ cfd_->ioptions()->compression_opts,
+ mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
+ TableFileCreationReason::kFlush, event_logger_, job_context_->job_id,
+ Env::IO_HIGH, &table_properties_, 0 /* level */,
+ creation_time, oldest_key_time, write_hint, current_time);
+ LogFlush(db_options_.info_log);
+ }
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
+ " bytes %s"
+ "%s",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
+ s.ToString().c_str(),
+ meta_.marked_for_compaction ? " (needs compaction)" : "");
+
+ if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
+ s = output_file_directory_->Fsync();
+ }
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
+ db_mutex_->Lock();
+ }
+ base_->Unref();
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ if (s.ok() && meta_.fd.GetFileSize() > 0) {
+ // if we have more than 1 background thread, then we cannot
+ // insert files directly into higher levels because some other
+ // threads could be concurrently producing compacted files for
+ // that key range.
+ // Add file to L0
+ edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
+ meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
+ meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
+ meta_.marked_for_compaction, meta_.oldest_blob_file_number,
+ meta_.oldest_ancester_time, meta_.file_creation_time,
+ meta_.file_checksum, meta_.file_checksum_func_name);
+ }
+#ifndef ROCKSDB_LITE
+ // Piggyback FlushJobInfo on the first flushed memtable.
+ mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif // !ROCKSDB_LITE
+
+ // Note that here we treat flush as level 0 compaction in internal stats
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ stats.micros = db_options_.env->NowMicros() - start_micros;
+ stats.cpu_micros = db_options_.env->NowCPUNanos() / 1000 - start_cpu_micros;
+ stats.bytes_written = meta_.fd.GetFileSize();
+ RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
+ cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+ cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
+ meta_.fd.GetFileSize());
+ RecordFlushIOStats();
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
+ db_mutex_->AssertHeld();
+ std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
+ info->cf_id = cfd_->GetID();
+ info->cf_name = cfd_->GetName();
+
+ const uint64_t file_number = meta_.fd.GetNumber();
+ info->file_path =
+ MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
+ info->file_number = file_number;
+ info->oldest_blob_file_number = meta_.oldest_blob_file_number;
+ info->thread_id = db_options_.env->GetThreadID();
+ info->job_id = job_context_->job_id;
+ info->smallest_seqno = meta_.fd.smallest_seqno;
+ info->largest_seqno = meta_.fd.largest_seqno;
+ info->table_properties = table_properties_;
+ info->flush_reason = cfd_->GetFlushReason();
+ return info;
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h
new file mode 100644
index 000000000..1f4435f4c
--- /dev/null
+++ b/src/rocksdb/db/flush_job.h
@@ -0,0 +1,158 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class FlushJob {
+ public:
+ // TODO(icanadi) make effort to reduce number of parameters here
+ // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
+ FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const MutableCFOptions& mutable_cf_options,
+ const uint64_t* max_memtable_id, const FileOptions& file_options,
+ VersionSet* versions, InstrumentedMutex* db_mutex,
+ std::atomic<bool>* shutting_down,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, JobContext* job_context,
+ LogBuffer* log_buffer, Directory* db_directory,
+ Directory* output_file_directory, CompressionType output_compression,
+ Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
+ const bool sync_output_directory, const bool write_manifest,
+ Env::Priority thread_pri);
+
+ ~FlushJob();
+
+ // Require db_mutex held.
+ // Once PickMemTable() is called, either Run() or Cancel() has to be called.
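+ //
+ // A typical call sequence (sketch only; mirrors flush_job_test.cc):
+ //   db_mutex->Lock();
+ //   flush_job.PickMemTable();
+ //   Status s = flush_job.Run(/*prep_tracker=*/nullptr, &file_meta);
+ //   db_mutex->Unlock();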
+ void PickMemTable();
+ Status Run(LogsWithPrepTracker* prep_tracker = nullptr,
+ FileMetaData* file_meta = nullptr);
+ void Cancel();
+ const autovector<MemTable*>& GetMemTables() const { return mems_; }
+
+#ifndef ROCKSDB_LITE
+ std::list<std::unique_ptr<FlushJobInfo>>* GetCommittedFlushJobsInfo() {
+ return &committed_flush_jobs_info_;
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ void ReportStartedFlush();
+ void ReportFlushInputSize(const autovector<MemTable*>& mems);
+ void RecordFlushIOStats();
+ Status WriteLevel0Table();
+#ifndef ROCKSDB_LITE
+ std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
+#endif // !ROCKSDB_LITE
+
+ const std::string& dbname_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const MutableCFOptions& mutable_cf_options_;
+ // Pointer to a variable storing the largest memtable id to flush in this
+ // flush job. RocksDB uses this variable to select the memtables to flush in
+ // this job. All memtables in this column family with an ID smaller than or
+ // equal to *max_memtable_id_ will be selected for flush. If null, then all
+ // memtables in the column family will be selected.
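+ // For example, if the immutable memtables have IDs {3, 4, 5} and
+ // *max_memtable_id_ == 4, only memtables 3 and 4 are selected.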
+ const uint64_t* max_memtable_id_;
+ const FileOptions file_options_;
+ VersionSet* versions_;
+ InstrumentedMutex* db_mutex_;
+ std::atomic<bool>* shutting_down_;
+ std::vector<SequenceNumber> existing_snapshots_;
+ SequenceNumber earliest_write_conflict_snapshot_;
+ SnapshotChecker* snapshot_checker_;
+ JobContext* job_context_;
+ LogBuffer* log_buffer_;
+ Directory* db_directory_;
+ Directory* output_file_directory_;
+ CompressionType output_compression_;
+ Statistics* stats_;
+ EventLogger* event_logger_;
+ TableProperties table_properties_;
+ bool measure_io_stats_;
+ // True if this flush job should call fsync on the output directory. False
+ // otherwise.
+ // Usually sync_output_directory_ is true. A flush job needs to call sync on
+ // the output directory before committing to the MANIFEST.
+ // However, an individual flush job does not have to call sync on the output
+ // directory if it is part of an atomic flush. After all flush jobs in the
+ // atomic flush succeed, call sync once on each distinct output directory.
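+ // (See FlushJobTest::FlushMemtablesMultipleColumnFamilies, where each job
+ // of the simulated atomic flush is built with sync_output_directory =
+ // false and write_manifest = false.)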
+ const bool sync_output_directory_;
+ // True if this flush job should write to MANIFEST after successfully
+ // flushing memtables. False otherwise.
+ // Usually write_manifest_ is true. A flush job commits to the MANIFEST after
+ // flushing the memtables.
+ // However, an individual flush job cannot rashly write to the MANIFEST
+ // immediately after it finishes the flush if it is part of an atomic flush.
+ // In this case, only after all flush jobs succeed in flush can RocksDB
+ // commit to the MANIFEST.
+ const bool write_manifest_;
+ // The current flush job can commit flush result of a concurrent flush job.
+ // We collect FlushJobInfo of all jobs committed by current job and fire
+ // OnFlushCompleted for them.
+ std::list<std::unique_ptr<FlushJobInfo>> committed_flush_jobs_info_;
+
+ // Variables below are set by PickMemTable():
+ FileMetaData meta_;
+ autovector<MemTable*> mems_;
+ VersionEdit* edit_;
+ Version* base_;
+ bool pick_memtable_called;
+ Env::Priority thread_pri_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc
new file mode 100644
index 000000000..b77a4a2a9
--- /dev/null
+++ b/src/rocksdb/db/flush_job_test.cc
@@ -0,0 +1,498 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <algorithm>
+#include <array>
+#include <map>
+#include <string>
+
+#include "db/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/flush_job.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) Mock out everything else:
+// 1. VersionSet
+// 2. Memtable
+class FlushJobTest : public testing::Test {
+ public:
+ FlushJobTest()
+ : env_(Env::Default()),
+ fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+ dbname_(test::PerThreadDBPath("flush_job_test")),
+ options_(),
+ db_options_(options_),
+ column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ shutting_down_(false),
+ mock_table_factory_(new mock::MockTableFactory()) {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ db_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ // TODO(icanadi) Remove this once we mock out VersionSet
+ NewDB();
+ std::vector<ColumnFamilyDescriptor> column_families;
+ cf_options_.table_factory = mock_table_factory_;
+ for (const auto& cf_name : column_family_names_) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr));
+ EXPECT_OK(versions_->Recover(column_families, false));
+ }
+
+ void NewDB() {
+ SetIdentityFile(env_, dbname_);
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ autovector<VersionEdit> new_cfs;
+ SequenceNumber last_seq = 1;
+ uint32_t cf_id = 1;
+ for (size_t i = 1; i != column_family_names_.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(column_family_names_[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ new_cf.SetLogNumber(0);
+ new_cf.SetNextFile(2);
+ new_cf.SetLastSequence(last_seq++);
+ new_cfs.emplace_back(new_cf);
+ }
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFile> file;
+ Status s = env_->NewWritableFile(
+ manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), manifest, EnvOptions()));
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+
+ for (const auto& e : new_cfs) {
+ record.clear();
+ e.EncodeTo(&record);
+ s = log.AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1, nullptr);
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ EnvOptions env_options_;
+ Options options_;
+ ImmutableDBOptions db_options_;
+ const std::vector<std::string> column_family_names_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ ColumnFamilyOptions cf_options_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+TEST_F(FlushJobTest, Empty) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ nullptr /* memtable_id */, env_options_, versions_.get(),
+ &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+ kNoCompression, nullptr, &event_logger, false,
+ true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run());
+ }
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, NonEmpty) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ new_mem->Ref();
+ auto inserted_keys = mock::MakeMockFile();
+ // Test data:
+ // seqno [ 1, 2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ]
+ // key [ 1001, 1002 ... 9998, 9999, 0, 1, 2 ... 999 ]
+ // range-delete "9995" -> "9999" at seqno 10000
+ // blob references with seqnos 10001..10006
+ for (int i = 1; i < 10000; ++i) {
+ std::string key(ToString((i + 1000) % 10000));
+ std::string value("value" + key);
+ new_mem->Add(SequenceNumber(i), kTypeValue, key, value);
+ if ((i + 1000) % 10000 < 9995) {
+ InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
+ inserted_keys.insert({internal_key.Encode().ToString(), value});
+ }
+ }
+
+ {
+ new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a");
+ InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion);
+ inserted_keys.insert({internal_key.Encode().ToString(), "9999a"});
+ }
+
+#ifndef ROCKSDB_LITE
+ // Note: the first two blob references will not be considered when resolving
+ // the oldest blob file referenced (the first one is inlined TTL, while the
+ // second one is TTL and thus points to a TTL blob file).
+ constexpr std::array<uint64_t, 6> blob_file_numbers{
+ kInvalidBlobFileNumber, 5, 103, 17, 102, 101};
+ for (size_t i = 0; i < blob_file_numbers.size(); ++i) {
+ std::string key(ToString(i + 10001));
+ std::string blob_index;
+ if (i == 0) {
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL,
+ "foo");
+ } else if (i == 1) {
+ BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL,
+ blob_file_numbers[i], /* offset */ i << 10,
+ /* size */ i << 20, kNoCompression);
+ } else {
+ BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i],
+ /* offset */ i << 10, /* size */ i << 20,
+ kNoCompression);
+ }
+
+ const SequenceNumber seq(i + 10001);
+ new_mem->Add(seq, kTypeBlobIndex, key, blob_index);
+
+ InternalKey internal_key(key, seq, kTypeBlobIndex);
+ inserted_keys.emplace_hint(inserted_keys.end(),
+ internal_key.Encode().ToString(), blob_index);
+ }
+#endif
+
+ autovector<MemTable*> to_delete;
+ cfd->imm()->Add(new_mem, &to_delete);
+ for (auto& m : to_delete) {
+ delete m;
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ nullptr /* memtable_id */, env_options_, versions_.get(),
+ &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+ kNoCompression, db_options_.statistics.get(),
+ &event_logger, true, true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER);
+
+ HistogramData hist;
+ FileMetaData file_meta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(nullptr, &file_meta));
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+
+ ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("9999a", file_meta.largest.user_key().ToString());
+ ASSERT_EQ(1, file_meta.fd.smallest_seqno);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(10006, file_meta.fd.largest_seqno);
+ ASSERT_EQ(17, file_meta.oldest_blob_file_number);
+#else
+ ASSERT_EQ(10000, file_meta.fd.largest_seqno);
+#endif
+ mock_table_factory_->AssertSingleFile(inserted_keys);
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
+ const size_t num_mems = 2;
+ const size_t num_mems_to_flush = 1;
+ const size_t num_keys_per_table = 100;
+ JobContext job_context(0);
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ std::vector<uint64_t> memtable_ids;
+ std::vector<MemTable*> new_mems;
+ for (size_t i = 0; i != num_mems; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+ new_mems.emplace_back(mem);
+ memtable_ids.push_back(mem->GetID());
+
+ for (size_t j = 0; j < num_keys_per_table; ++j) {
+ std::string key(ToString(j + i * num_keys_per_table));
+ std::string value("value" + key);
+ mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, key,
+ value);
+ }
+ }
+
+ autovector<MemTable*> to_delete;
+ for (auto mem : new_mems) {
+ cfd->imm()->Add(mem, &to_delete);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+
+ assert(memtable_ids.size() == num_mems);
+ uint64_t smallest_memtable_id = memtable_ids.front();
+ uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ &flush_memtable_id, env_options_, versions_.get(), &mutex_,
+ &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker,
+ &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER);
+ HistogramData hist;
+ FileMetaData file_meta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(nullptr /* prep_tracker */, &file_meta));
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+
+ ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("99", file_meta.largest.user_key().ToString());
+ ASSERT_EQ(0, file_meta.fd.smallest_seqno);
+ ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1),
+ file_meta.fd.largest_seqno);
+ ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number);
+
+ for (auto m : to_delete) {
+ delete m;
+ }
+ to_delete.clear();
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
+ autovector<ColumnFamilyData*> all_cfds;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ all_cfds.push_back(cfd);
+ }
+ const std::vector<size_t> num_memtables = {2, 1, 3};
+ assert(num_memtables.size() == column_family_names_.size());
+ const size_t num_keys_per_memtable = 1000;
+ JobContext job_context(0);
+ std::vector<uint64_t> memtable_ids;
+ std::vector<SequenceNumber> smallest_seqs;
+ std::vector<SequenceNumber> largest_seqs;
+ autovector<MemTable*> to_delete;
+ SequenceNumber curr_seqno = 0;
+ size_t k = 0;
+ for (auto cfd : all_cfds) {
+ smallest_seqs.push_back(curr_seqno);
+ for (size_t i = 0; i != num_memtables[k]; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(
+ *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+
+ for (size_t j = 0; j != num_keys_per_memtable; ++j) {
+ std::string key(ToString(j + i * num_keys_per_memtable));
+ std::string value("value" + key);
+ mem->Add(curr_seqno++, kTypeValue, key, value);
+ }
+
+ cfd->imm()->Add(mem, &to_delete);
+ }
+ largest_seqs.push_back(curr_seqno - 1);
+ memtable_ids.push_back(num_memtables[k++] - 1);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ std::vector<std::unique_ptr<FlushJob>> flush_jobs;
+ k = 0;
+ for (auto cfd : all_cfds) {
+ std::vector<SequenceNumber> snapshot_seqs;
+ flush_jobs.emplace_back(new FlushJob(
+ dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+ &memtable_ids[k], env_options_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker,
+ &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ false /* sync_output_directory */, false /* write_manifest */,
+ Env::Priority::USER));
+ k++;
+ }
+ HistogramData hist;
+ std::vector<FileMetaData> file_metas;
+ // Call reserve to avoid auto-resizing
+ file_metas.reserve(flush_jobs.size());
+ mutex_.Lock();
+ for (auto& job : flush_jobs) {
+ job->PickMemTable();
+ }
+ for (auto& job : flush_jobs) {
+ FileMetaData meta;
+ // Run will release and re-acquire mutex
+ ASSERT_OK(job->Run(nullptr /* prep_tracker */, &meta));
+ file_metas.emplace_back(meta);
+ }
+ autovector<FileMetaData*> file_meta_ptrs;
+ for (auto& meta : file_metas) {
+ file_meta_ptrs.push_back(&meta);
+ }
+ autovector<const autovector<MemTable*>*> mems_list;
+ for (size_t i = 0; i != all_cfds.size(); ++i) {
+ const auto& mems = flush_jobs[i]->GetMemTables();
+ mems_list.push_back(&mems);
+ }
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ for (auto cfd : all_cfds) {
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ }
+
+ Status s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free,
+ nullptr /* db_directory */, nullptr /* log_buffer */);
+ ASSERT_OK(s);
+
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+ k = 0;
+ for (const auto& file_meta : file_metas) {
+ ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("999", file_meta.largest.user_key()
+ .ToString()); // max key by bytewise comparator
+ ASSERT_EQ(smallest_seqs[k], file_meta.fd.smallest_seqno);
+ ASSERT_EQ(largest_seqs[k], file_meta.fd.largest_seqno);
+ // Verify that imm is empty
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
+ all_cfds[k]->imm()->GetEarliestMemTableID());
+ ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID());
+ ++k;
+ }
+
+ for (auto m : to_delete) {
+ delete m;
+ }
+ to_delete.clear();
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, Snapshots) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+
+ std::set<SequenceNumber> snapshots_set;
+ int keys = 10000;
+ int max_inserts_per_keys = 8;
+
+ Random rnd(301);
+ for (int i = 0; i < keys / 2; ++i) {
+ snapshots_set.insert(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1);
+ }
+ // set has already removed the duplicate snapshots
+ std::vector<SequenceNumber> snapshots(snapshots_set.begin(),
+ snapshots_set.end());
+
+ new_mem->Ref();
+ SequenceNumber current_seqno = 0;
+ auto inserted_keys = mock::MakeMockFile();
+ for (int i = 1; i < keys; ++i) {
+ std::string key(ToString(i));
+ int insertions = rnd.Uniform(max_inserts_per_keys);
+ for (int j = 0; j < insertions; ++j) {
+ std::string value(test::RandomHumanReadableString(&rnd, 10));
+ auto seqno = ++current_seqno;
+ new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value);
+ // a key is visible only if:
+ // 1. it's the last one written (j == insertions - 1)
+ // 2. there's a snapshot pointing at it
+ bool visible = (j == insertions - 1) ||
+ (snapshots_set.find(seqno) != snapshots_set.end());
+ if (visible) {
+ InternalKey internal_key(key, seqno, kTypeValue);
+ inserted_keys.insert({internal_key.Encode().ToString(), value});
+ }
+ }
+ }
+
+ autovector<MemTable*> to_delete;
+ cfd->imm()->Add(new_mem, &to_delete);
+ for (auto& m : to_delete) {
+ delete m;
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ nullptr /* memtable_id */, env_options_, versions_.get(),
+ &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+ kNoCompression, db_options_.statistics.get(),
+ &event_logger, true, true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER);
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run());
+ mutex_.Unlock();
+ mock_table_factory_->AssertSingleFile(inserted_keys);
+ HistogramData hist;
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+ job_context.Clean();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_scheduler.cc b/src/rocksdb/db/flush_scheduler.cc
new file mode 100644
index 000000000..6f4d3e1a5
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+ {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ assert(checking_set_.count(cfd) == 0);
+ checking_set_.insert(cfd);
+ }
+#endif // NDEBUG
+ cfd->Ref();
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
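+ // Treiber-style lock-free push onto an intrusive singly-linked list: the
+ // new node first points at the current head, then the CAS below swings
+ // head_ to the new node, retrying if another thread won the race.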
+ Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)};
+ while (!head_.compare_exchange_strong(
+ node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) {
+ // failing CAS updates the first param, so we are already set for
+ // retry. TakeNextColumnFamily won't happen until after another
+ // inter-thread synchronization, so we don't even need release
+ // semantics for this CAS
+ }
+#endif // __clang_analyzer__
+}
+
+ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
+ while (true) {
+ if (head_.load(std::memory_order_relaxed) == nullptr) {
+ return nullptr;
+ }
+
+ // dequeue the head
+ Node* node = head_.load(std::memory_order_relaxed);
+ head_.store(node->next, std::memory_order_relaxed);
+ ColumnFamilyData* cfd = node->column_family;
+ delete node;
+
+#ifndef NDEBUG
+ {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ auto iter = checking_set_.find(cfd);
+ assert(iter != checking_set_.end());
+ checking_set_.erase(iter);
+ }
+#endif // NDEBUG
+
+ if (!cfd->IsDropped()) {
+ // success
+ return cfd;
+ }
+
+ // no longer relevant, retry
+ cfd->UnrefAndTryDelete();
+ }
+}
+
+bool FlushScheduler::Empty() {
+ auto rv = head_.load(std::memory_order_relaxed) == nullptr;
+#ifndef NDEBUG
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ // Empty() is allowed to be called concurrently with ScheduleWork(). It
+ // would only miss the most recent schedules.
+ assert((rv == checking_set_.empty()) || rv);
+#endif // NDEBUG
+ return rv;
+}
+
+void FlushScheduler::Clear() {
+ ColumnFamilyData* cfd;
+ while ((cfd = TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ }
+ assert(head_.load(std::memory_order_relaxed) == nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_scheduler.h b/src/rocksdb/db/flush_scheduler.h
new file mode 100644
index 000000000..cbe17994f
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <mutex>
+#include <set>
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// FlushScheduler keeps track of all column families whose memtable may
+// be full and require flushing. Unless otherwise noted, all methods on
+// FlushScheduler should be called only with the DB mutex held or from
+// a single-threaded recovery context.
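+//
+// A minimal usage sketch (illustrative only):
+//   flush_scheduler.ScheduleWork(cfd);  // memtable may be full
+//   while (ColumnFamilyData* picked = flush_scheduler.TakeNextColumnFamily()) {
+//     // ... schedule a flush for `picked` ...
+//     picked->UnrefAndTryDelete();  // caller must release the reference
+//   }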
+class FlushScheduler {
+ public:
+ FlushScheduler() : head_(nullptr) {}
+
+ // May be called from multiple threads at once, but not concurrently with
+ // any other method calls on this instance.
+ void ScheduleWork(ColumnFamilyData* cfd);
+
+ // Removes and returns Ref()-ed column family. Client needs to Unref().
+ // Filters column families that have been dropped.
+ ColumnFamilyData* TakeNextColumnFamily();
+
+ // This can be called concurrently with ScheduleWork but it would miss all
+ // the scheduled flushes after the last synchronization. This would result
+ // in less precise enforcement of memtable sizes but should not matter much.
+ bool Empty();
+
+ void Clear();
+
+ private:
+ struct Node {
+ ColumnFamilyData* column_family;
+ Node* next;
+ };
+
+ std::atomic<Node*> head_;
+#ifndef NDEBUG
+ std::mutex checking_mutex_;
+ std::set<ColumnFamilyData*> checking_set_;
+#endif // NDEBUG
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
new file mode 100644
index 000000000..f2b882549
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -0,0 +1,975 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/forward_iterator.h"
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Usage:
+// ForwardLevelIterator iter;
+// iter.SetFileIndex(file_index);
+// iter.Seek(target); // or iter.SeekToFirst();
+// iter.Next()
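+//
+// Note: files containing range tombstones are not supported; Reset() sets a
+// Status::NotSupported error in that case (unless
+// read_options.ignore_range_deletions is set).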
+class ForwardLevelIterator : public InternalIterator {
+ public:
+ ForwardLevelIterator(const ColumnFamilyData* const cfd,
+ const ReadOptions& read_options,
+ const std::vector<FileMetaData*>& files,
+ const SliceTransform* prefix_extractor)
+ : cfd_(cfd),
+ read_options_(read_options),
+ files_(files),
+ valid_(false),
+ file_index_(std::numeric_limits<uint32_t>::max()),
+ file_iter_(nullptr),
+ pinned_iters_mgr_(nullptr),
+ prefix_extractor_(prefix_extractor) {}
+
+ ~ForwardLevelIterator() override {
+ // Reset current pointer
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(file_iter_);
+ } else {
+ delete file_iter_;
+ }
+ }
+
+ void SetFileIndex(uint32_t file_index) {
+ assert(file_index < files_.size());
+ status_ = Status::OK();
+ if (file_index != file_index_) {
+ file_index_ = file_index;
+ Reset();
+ }
+ }
+ void Reset() {
+ assert(file_index_ < files_.size());
+
+ // Reset current pointer
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(file_iter_);
+ } else {
+ delete file_iter_;
+ }
+
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ file_iter_ = cfd_->table_cache()->NewIterator(
+ read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+ *files_[file_index_],
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ prefix_extractor_, /*table_reader_ptr=*/nullptr,
+ /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator,
+ /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ valid_ = false;
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ }
+ }
+ void SeekToLast() override {
+ status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()");
+ valid_ = false;
+ }
+ void Prev() override {
+ status_ = Status::NotSupported("ForwardLevelIterator::Prev()");
+ valid_ = false;
+ }
+ bool Valid() const override {
+ return valid_;
+ }
+ void SeekToFirst() override {
+ assert(file_iter_ != nullptr);
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+ file_iter_->SeekToFirst();
+ valid_ = file_iter_->Valid();
+ }
+ void Seek(const Slice& internal_key) override {
+ assert(file_iter_ != nullptr);
+
+ // This deviates from the usual convention for InternalIterator::Seek() in
+ // that it doesn't discard pre-existing error status. That's because this
+ // Seek() is only supposed to be called immediately after SetFileIndex()
+ // (which discards pre-existing error status), and SetFileIndex() may set
+ // an error status, which we shouldn't discard.
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+
+ file_iter_->Seek(internal_key);
+ valid_ = file_iter_->Valid();
+ }
+ void SeekForPrev(const Slice& /*internal_key*/) override {
+ status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()");
+ valid_ = false;
+ }
+ void Next() override {
+ assert(valid_);
+ file_iter_->Next();
+ for (;;) {
+ valid_ = file_iter_->Valid();
+ if (!file_iter_->status().ok()) {
+ assert(!valid_);
+ return;
+ }
+ if (valid_) {
+ return;
+ }
+ if (file_index_ + 1 >= files_.size()) {
+ valid_ = false;
+ return;
+ }
+ SetFileIndex(file_index_ + 1);
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+ file_iter_->SeekToFirst();
+ }
+ }
+ Slice key() const override {
+ assert(valid_);
+ return file_iter_->key();
+ }
+ Slice value() const override {
+ assert(valid_);
+ return file_iter_->value();
+ }
+ Status status() const override {
+ if (!status_.ok()) {
+ return status_;
+ } else if (file_iter_) {
+ return file_iter_->status();
+ }
+ return Status::OK();
+ }
+ bool IsKeyPinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_->IsKeyPinned();
+ }
+ bool IsValuePinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_->IsValuePinned();
+ }
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ if (file_iter_) {
+ file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ private:
+ const ColumnFamilyData* const cfd_;
+ const ReadOptions& read_options_;
+ const std::vector<FileMetaData*>& files_;
+
+ bool valid_;
+ uint32_t file_index_;
+ Status status_;
+ InternalIterator* file_iter_;
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ const SliceTransform* prefix_extractor_;
+};
+
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* current_sv)
+ : db_(db),
+ read_options_(read_options),
+ cfd_(cfd),
+ prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()),
+ user_comparator_(cfd->user_comparator()),
+ immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+ sv_(current_sv),
+ mutable_iter_(nullptr),
+ current_(nullptr),
+ valid_(false),
+ status_(Status::OK()),
+ immutable_status_(Status::OK()),
+ has_iter_trimmed_for_upper_bound_(false),
+ current_over_upper_bound_(false),
+ is_prev_set_(false),
+ is_prev_inclusive_(false),
+ pinned_iters_mgr_(nullptr) {
+ if (sv_) {
+ RebuildIterators(false);
+ }
+}
+
+ForwardIterator::~ForwardIterator() {
+ Cleanup(true);
+}
+
+void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv,
+ bool background_purge_on_iterator_cleanup) {
+ if (sv->Unref()) {
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread.
+ JobContext job_context(0);
+ db->mutex_.Lock();
+ sv->Cleanup();
+ db->FindObsoleteFiles(&job_context, false, true);
+ if (background_purge_on_iterator_cleanup) {
+ db->ScheduleBgLogWriterClose(&job_context);
+ db->AddSuperVersionsToFreeQueue(sv);
+ db->SchedulePurge();
+ }
+ db->mutex_.Unlock();
+ if (!background_purge_on_iterator_cleanup) {
+ delete sv;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ db->PurgeObsoleteFiles(job_context, background_purge_on_iterator_cleanup);
+ }
+ job_context.Clean();
+ }
+}
+
+namespace {
+struct SVCleanupParams {
+ DBImpl* db;
+ SuperVersion* sv;
+ bool background_purge_on_iterator_cleanup;
+};
+}  // namespace
+
+// Used in PinnedIteratorsManager to release pinned SuperVersion
+void ForwardIterator::DeferredSVCleanup(void* arg) {
+ auto d = reinterpret_cast<SVCleanupParams*>(arg);
+ ForwardIterator::SVCleanup(
+ d->db, d->sv, d->background_purge_on_iterator_cleanup);
+ delete d;
+}
+
+void ForwardIterator::SVCleanup() {
+ if (sv_ == nullptr) {
+ return;
+ }
+ bool background_purge =
+ read_options_.background_purge_on_iterator_cleanup ||
+ db_->immutable_db_options().avoid_unnecessary_blocking_io;
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ // pinned_iters_mgr_ tells us to make sure that all visited key-value slices
+ // are alive until pinned_iters_mgr_->ReleasePinnedData() is called.
+ // The slices may point into some memtables owned by sv_, so we need to keep
+ // sv_ referenced until pinned_iters_mgr_ unpins everything.
+ auto p = new SVCleanupParams{db_, sv_, background_purge};
+ pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup);
+ } else {
+ SVCleanup(db_, sv_, background_purge);
+ }
+}
+
+void ForwardIterator::Cleanup(bool release_sv) {
+ if (mutable_iter_ != nullptr) {
+ DeleteIterator(mutable_iter_, true /* is_arena */);
+ }
+
+ for (auto* m : imm_iters_) {
+ DeleteIterator(m, true /* is_arena */);
+ }
+ imm_iters_.clear();
+
+ for (auto* f : l0_iters_) {
+ DeleteIterator(f);
+ }
+ l0_iters_.clear();
+
+ for (auto* l : level_iters_) {
+ DeleteIterator(l);
+ }
+ level_iters_.clear();
+
+ if (release_sv) {
+ SVCleanup();
+ }
+}
+
+bool ForwardIterator::Valid() const {
+ // See UpdateCurrent().
+ return valid_ ? !current_over_upper_bound_ : false;
+}
+
+void ForwardIterator::SeekToFirst() {
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ RenewIterators();
+ } else if (immutable_status_.IsIncomplete()) {
+ ResetIncompleteIterators();
+ }
+ SeekInternal(Slice(), true);
+}
+
+bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const {
+ return !(read_options_.iterate_upper_bound == nullptr ||
+ cfd_->internal_comparator().user_comparator()->Compare(
+ ExtractUserKey(internal_key),
+ *read_options_.iterate_upper_bound) < 0);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ RenewIterators();
+ } else if (immutable_status_.IsIncomplete()) {
+ ResetIncompleteIterators();
+ }
+ SeekInternal(internal_key, false);
+}
+
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+ bool seek_to_first) {
+ assert(mutable_iter_);
+ // mutable
+ seek_to_first ? mutable_iter_->SeekToFirst() :
+ mutable_iter_->Seek(internal_key);
+
+ // immutable
+  // TODO(ljin): NeedToSeekImmutable has a negative impact on performance
+  // if it turns out that the immutable iterators need to be re-seeked often.
+  // We probably want to have an option to turn it off.
+ if (seek_to_first || NeedToSeekImmutable(internal_key)) {
+ immutable_status_ = Status::OK();
+ if (has_iter_trimmed_for_upper_bound_ &&
+ (
+ // prev_ is not set yet
+ is_prev_set_ == false ||
+ // We are doing SeekToFirst() and internal_key.size() = 0
+ seek_to_first ||
+ // prev_key_ > internal_key
+ cfd_->internal_comparator().InternalKeyComparator::Compare(
+ prev_key_.GetInternalKey(), internal_key) > 0)) {
+ // Some iterators are trimmed. Need to rebuild.
+ RebuildIterators(true);
+ // Already seeked mutable iter, so seek again
+ seek_to_first ? mutable_iter_->SeekToFirst()
+ : mutable_iter_->Seek(internal_key);
+ }
+ {
+ auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+ immutable_min_heap_.swap(tmp);
+ }
+ for (size_t i = 0; i < imm_iters_.size(); i++) {
+ auto* m = imm_iters_[i];
+ seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+ if (!m->status().ok()) {
+ immutable_status_ = m->status();
+ } else if (m->Valid()) {
+ immutable_min_heap_.push(m);
+ }
+ }
+
+ Slice target_user_key;
+ if (!seek_to_first) {
+ target_user_key = ExtractUserKey(internal_key);
+ }
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ continue;
+ }
+ if (seek_to_first) {
+ l0_iters_[i]->SeekToFirst();
+ } else {
+        // If the target key is past the largest key of this file, we are
+        // sure Next() won't go over this file.
+ if (user_comparator_->Compare(target_user_key,
+ l0[i]->largest.user_key()) > 0) {
+ if (read_options_.iterate_upper_bound != nullptr) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ }
+ continue;
+ }
+ l0_iters_[i]->Seek(internal_key);
+ }
+
+ if (!l0_iters_[i]->status().ok()) {
+ immutable_status_ = l0_iters_[i]->status();
+ } else if (l0_iters_[i]->Valid() &&
+ !IsOverUpperBound(l0_iters_[i]->key())) {
+ immutable_min_heap_.push(l0_iters_[i]);
+ } else {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ const std::vector<FileMetaData*>& level_files =
+ vstorage->LevelFiles(level);
+ if (level_files.empty()) {
+ continue;
+ }
+ if (level_iters_[level - 1] == nullptr) {
+ continue;
+ }
+ uint32_t f_idx = 0;
+ if (!seek_to_first) {
+ f_idx = FindFileInRange(level_files, internal_key, 0,
+ static_cast<uint32_t>(level_files.size()));
+ }
+
+ // Seek
+ if (f_idx < level_files.size()) {
+ level_iters_[level - 1]->SetFileIndex(f_idx);
+ seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
+ level_iters_[level - 1]->Seek(internal_key);
+
+ if (!level_iters_[level - 1]->status().ok()) {
+ immutable_status_ = level_iters_[level - 1]->status();
+ } else if (level_iters_[level - 1]->Valid() &&
+ !IsOverUpperBound(level_iters_[level - 1]->key())) {
+ immutable_min_heap_.push(level_iters_[level - 1]);
+ } else {
+ // Nothing in this level is interesting. Remove.
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(level_iters_[level - 1]);
+ level_iters_[level - 1] = nullptr;
+ }
+ }
+ }
+
+ if (seek_to_first) {
+ is_prev_set_ = false;
+ } else {
+ prev_key_.SetInternalKey(internal_key);
+ is_prev_set_ = true;
+ is_prev_inclusive_ = true;
+ }
+
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
+ } else if (current_ && current_ != mutable_iter_) {
+ // current_ is one of immutable iterators, push it back to the heap
+ immutable_min_heap_.push(current_);
+ }
+
+ UpdateCurrent();
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this);
+}
+
+void ForwardIterator::Next() {
+ assert(valid_);
+ bool update_prev_key = false;
+
+ if (sv_ == nullptr ||
+ sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ std::string current_key = key().ToString();
+ Slice old_key(current_key.data(), current_key.size());
+
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else {
+ RenewIterators();
+ }
+ SeekInternal(old_key, false);
+ if (!valid_ || key().compare(old_key) != 0) {
+ return;
+ }
+ } else if (current_ != mutable_iter_) {
+ // It is going to advance immutable iterator
+
+ if (is_prev_set_ && prefix_extractor_) {
+ // advance prev_key_ to current_ only if they share the same prefix
+ update_prev_key =
+ prefix_extractor_->Transform(prev_key_.GetUserKey())
+ .compare(prefix_extractor_->Transform(current_->key())) == 0;
+ } else {
+ update_prev_key = true;
+ }
+
+
+ if (update_prev_key) {
+ prev_key_.SetInternalKey(current_->key());
+ is_prev_set_ = true;
+ is_prev_inclusive_ = false;
+ }
+ }
+
+ current_->Next();
+ if (current_ != mutable_iter_) {
+ if (!current_->status().ok()) {
+ immutable_status_ = current_->status();
+ } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) {
+ immutable_min_heap_.push(current_);
+ } else {
+ if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) {
+ // remove the current iterator
+ DeleteCurrentIter();
+ current_ = nullptr;
+ }
+ if (update_prev_key) {
+ mutable_iter_->Seek(prev_key_.GetInternalKey());
+ }
+ }
+ }
+ UpdateCurrent();
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this);
+}
+
+Slice ForwardIterator::key() const {
+ assert(valid_);
+ return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+ assert(valid_);
+ return current_->value();
+}
+
+Status ForwardIterator::status() const {
+ if (!status_.ok()) {
+ return status_;
+ } else if (!mutable_iter_->status().ok()) {
+ return mutable_iter_->status();
+ }
+
+ return immutable_status_;
+}
+
+Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) {
+ assert(prop != nullptr);
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ *prop = ToString(sv_->version_number);
+ return Status::OK();
+ }
+ return Status::InvalidArgument();
+}
+
+void ForwardIterator::SetPinnedItersMgr(
+ PinnedIteratorsManager* pinned_iters_mgr) {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ UpdateChildrenPinnedItersMgr();
+}
+
+void ForwardIterator::UpdateChildrenPinnedItersMgr() {
+ // Set PinnedIteratorsManager for mutable memtable iterator.
+ if (mutable_iter_) {
+ mutable_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ // Set PinnedIteratorsManager for immutable memtable iterators.
+ for (InternalIterator* child_iter : imm_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ // Set PinnedIteratorsManager for L0 files iterators.
+ for (InternalIterator* child_iter : l0_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ // Set PinnedIteratorsManager for L1+ levels iterators.
+ for (ForwardLevelIterator* child_iter : level_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+}
+
+bool ForwardIterator::IsKeyPinned() const {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsKeyPinned();
+}
+
+bool ForwardIterator::IsValuePinned() const {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsValuePinned();
+}
+
+void ForwardIterator::RebuildIterators(bool refresh_sv) {
+ // Clean up
+ Cleanup(refresh_sv);
+ if (refresh_sv) {
+ // New
+ sv_ = cfd_->GetReferencedSuperVersion(db_);
+ }
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+ sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+ if (!read_options_.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ sv_->mem->NewRangeTombstoneIterator(
+ read_options_, sv_->current->version_set()->LastSequence()));
+ range_del_agg.AddTombstones(std::move(range_del_iter));
+ sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_,
+ &range_del_agg);
+ }
+ has_iter_trimmed_for_upper_bound_ = false;
+
+ const auto* vstorage = sv_->current->storage_info();
+ const auto& l0_files = vstorage->LevelFiles(0);
+ l0_iters_.reserve(l0_files.size());
+ for (const auto* l0 : l0_files) {
+ if ((read_options_.iterate_upper_bound != nullptr) &&
+ cfd_->internal_comparator().user_comparator()->Compare(
+ l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) {
+ // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator
+ // will never be interested in files with smallest key above
+ // iterate_upper_bound, since iterate_upper_bound can't be changed.
+ l0_iters_.push_back(nullptr);
+ continue;
+ }
+ l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0,
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ sv_->mutable_cf_options.prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr));
+ }
+ BuildLevelIterators(vstorage);
+ current_ = nullptr;
+ is_prev_set_ = false;
+
+ UpdateChildrenPinnedItersMgr();
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ valid_ = false;
+ }
+}
+
+void ForwardIterator::RenewIterators() {
+ SuperVersion* svnew;
+ assert(sv_);
+ svnew = cfd_->GetReferencedSuperVersion(db_);
+
+ if (mutable_iter_ != nullptr) {
+ DeleteIterator(mutable_iter_, true /* is_arena */);
+ }
+ for (auto* m : imm_iters_) {
+ DeleteIterator(m, true /* is_arena */);
+ }
+ imm_iters_.clear();
+
+ mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_);
+ svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ if (!read_options_.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ svnew->mem->NewRangeTombstoneIterator(
+ read_options_, sv_->current->version_set()->LastSequence()));
+ range_del_agg.AddTombstones(std::move(range_del_iter));
+ svnew->imm->AddRangeTombstoneIterators(read_options_, &arena_,
+ &range_del_agg);
+ }
+
+ const auto* vstorage = sv_->current->storage_info();
+ const auto& l0_files = vstorage->LevelFiles(0);
+ const auto* vstorage_new = svnew->current->storage_info();
+ const auto& l0_files_new = vstorage_new->LevelFiles(0);
+ size_t iold, inew;
+ bool found;
+ std::vector<InternalIterator*> l0_iters_new;
+ l0_iters_new.reserve(l0_files_new.size());
+
+ for (inew = 0; inew < l0_files_new.size(); inew++) {
+ found = false;
+ for (iold = 0; iold < l0_files.size(); iold++) {
+ if (l0_files[iold] == l0_files_new[inew]) {
+ found = true;
+ break;
+ }
+ }
+ if (found) {
+ if (l0_iters_[iold] == nullptr) {
+ l0_iters_new.push_back(nullptr);
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this);
+ } else {
+ l0_iters_new.push_back(l0_iters_[iold]);
+ l0_iters_[iold] = nullptr;
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this);
+ }
+ continue;
+ }
+ l0_iters_new.push_back(cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+ *l0_files_new[inew],
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ svnew->mutable_cf_options.prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr));
+ }
+
+ for (auto* f : l0_iters_) {
+ DeleteIterator(f);
+ }
+ l0_iters_.clear();
+ l0_iters_ = l0_iters_new;
+
+ for (auto* l : level_iters_) {
+ DeleteIterator(l);
+ }
+ level_iters_.clear();
+ BuildLevelIterators(vstorage_new);
+ current_ = nullptr;
+ is_prev_set_ = false;
+ SVCleanup();
+ sv_ = svnew;
+
+ UpdateChildrenPinnedItersMgr();
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ valid_ = false;
+ }
+}
+
+void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage) {
+ level_iters_.reserve(vstorage->num_levels() - 1);
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ const auto& level_files = vstorage->LevelFiles(level);
+ if ((level_files.empty()) ||
+ ((read_options_.iterate_upper_bound != nullptr) &&
+ (user_comparator_->Compare(*read_options_.iterate_upper_bound,
+ level_files[0]->smallest.user_key()) <
+ 0))) {
+ level_iters_.push_back(nullptr);
+ if (!level_files.empty()) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ }
+ } else {
+ level_iters_.push_back(new ForwardLevelIterator(
+ cfd_, read_options_, level_files,
+ sv_->mutable_cf_options.prefix_extractor.get()));
+ }
+ }
+}
+
+void ForwardIterator::ResetIncompleteIterators() {
+ const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
+ for (size_t i = 0; i < l0_iters_.size(); ++i) {
+ assert(i < l0_files.size());
+ if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) {
+ continue;
+ }
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+ *l0_files[i], /*range_del_agg=*/nullptr,
+ sv_->mutable_cf_options.prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ for (auto* level_iter : level_iters_) {
+ if (level_iter && level_iter->status().IsIncomplete()) {
+ level_iter->Reset();
+ }
+ }
+
+ current_ = nullptr;
+ is_prev_set_ = false;
+}
+
+void ForwardIterator::UpdateCurrent() {
+ if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
+ current_ = nullptr;
+ } else if (immutable_min_heap_.empty()) {
+ current_ = mutable_iter_;
+ } else if (!mutable_iter_->Valid()) {
+ current_ = immutable_min_heap_.top();
+ immutable_min_heap_.pop();
+ } else {
+ current_ = immutable_min_heap_.top();
+ assert(current_ != nullptr);
+ assert(current_->Valid());
+ int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
+ mutable_iter_->key(), current_->key());
+ assert(cmp != 0);
+ if (cmp > 0) {
+ immutable_min_heap_.pop();
+ } else {
+ current_ = mutable_iter_;
+ }
+ }
+ valid_ = current_ != nullptr && immutable_status_.ok();
+ if (!status_.ok()) {
+ status_ = Status::OK();
+ }
+
+ // Upper bound doesn't apply to the memtable iterator. We want Valid() to
+ // return false when all iterators are over iterate_upper_bound, but can't
+ // just set valid_ to false, as that would effectively disable the tailing
+ // optimization (Seek() would be called on all immutable iterators regardless
+ // of whether the target key is greater than prev_key_).
+ current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key());
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+ // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+ // such that there are no records with keys within that range in
+ // immutable_min_heap_. Since immutable structures (SST files and immutable
+ // memtables) can't change in this version, we don't need to do a seek if
+ // 'target' belongs to that interval (immutable_min_heap_.top() is already
+ // at the correct position).
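+  //
+  // Illustrative example (hypothetical keys): after Seek("k10"), prev_key_ is
+  // "k10" and the immutable iterators might all be positioned at or beyond
+  // "k17". A later Seek("k13") falls within that range, so the immutable
+  // iterators are already positioned correctly and only the mutable memtable
+  // iterator needs to be re-seeked.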
+
+ if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
+ return true;
+ }
+ Slice prev_key = prev_key_.GetInternalKey();
+ if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+ prefix_extractor_->Transform(prev_key)) != 0) {
+ return true;
+ }
+ if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+ prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
+ return true;
+ }
+
+ if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+ // Nothing to seek on.
+ return false;
+ }
+ if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+ target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+ : current_->key()) > 0) {
+ return true;
+ }
+ return false;
+}
+
+void ForwardIterator::DeleteCurrentIter() {
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ continue;
+ }
+ if (l0_iters_[i] == current_) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ return;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ if (level_iters_[level - 1] == nullptr) {
+ continue;
+ }
+ if (level_iters_[level - 1] == current_) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(level_iters_[level - 1]);
+ level_iters_[level - 1] = nullptr;
+ }
+ }
+}
+
+bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters,
+ int* pnum_iters) {
+ bool retval = false;
+ int deleted_iters = 0;
+ int num_iters = 0;
+
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ retval = true;
+ deleted_iters++;
+ } else {
+ num_iters++;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ if ((level_iters_[level - 1] == nullptr) &&
+ (!vstorage->LevelFiles(level).empty())) {
+ retval = true;
+ deleted_iters++;
+ } else if (!vstorage->LevelFiles(level).empty()) {
+ num_iters++;
+ }
+ }
+ if ((!retval) && num_iters <= 1) {
+ retval = true;
+ }
+ if (pdeleted_iters) {
+ *pdeleted_iters = deleted_iters;
+ }
+ if (pnum_iters) {
+ *pnum_iters = num_iters;
+ }
+ return retval;
+}
+
+uint32_t ForwardIterator::FindFileInRange(
+ const std::vector<FileMetaData*>& files, const Slice& internal_key,
+ uint32_t left, uint32_t right) {
+ auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool {
+ return cfd_->internal_comparator().InternalKeyComparator::Compare(
+ f->largest.Encode(), key) < 0;
+ };
+ const auto &b = files.begin();
+ return static_cast<uint32_t>(std::lower_bound(b + left,
+ b + right, internal_key, cmp) - b);
+}
+
+void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) {
+ if (iter == nullptr) {
+ return;
+ }
+
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(iter, is_arena);
+ } else {
+ if (is_arena) {
+ iter->~InternalIterator();
+ } else {
+ delete iter;
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
new file mode 100644
index 000000000..8c671c75f
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.h
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include <queue>
+
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class ForwardLevelIterator;
+class VersionStorageInfo;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+ explicit MinIterComparator(const Comparator* comparator) :
+ comparator_(comparator) {}
+
+ bool operator()(InternalIterator* a, InternalIterator* b) {
+ return comparator_->Compare(a->key(), b->key()) > 0;
+ }
+ private:
+ const Comparator* comparator_;
+};
+
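+// Note: std::priority_queue is a max-heap with respect to its comparator, so
+// MinIterComparator deliberately orders by "greater key"; the resulting
+// MinIterHeap pops the iterator positioned at the smallest key first.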
+typedef std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
+ MinIterComparator> MinIterHeap;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called. Subsequent Next() calls do not see values written
+ * after that point.
+ */
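+//
+// Minimal usage sketch through the public API (illustrative only; "db" and
+// "start" are placeholders -- see forward_iterator_bench.cc for a complete
+// example). Tailing iterators are the main consumer of this class:
+//
+//   ROCKSDB_NAMESPACE::ReadOptions ro;
+//   ro.tailing = true;
+//   std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(ro));
+//   for (it->Seek(start); it->Valid(); it->Next()) {
+//     // consume it->key() / it->value(); only Seek()/Next() are supported
+//   }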
+class ForwardIterator : public InternalIterator {
+ public:
+ ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+ ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr);
+ virtual ~ForwardIterator();
+
+ void SeekForPrev(const Slice& /*target*/) override {
+ status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
+ valid_ = false;
+ }
+ void SeekToLast() override {
+ status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+ valid_ = false;
+ }
+ void Prev() override {
+ status_ = Status::NotSupported("ForwardIterator::Prev");
+ valid_ = false;
+ }
+
+ virtual bool Valid() const override;
+ void SeekToFirst() override;
+ virtual void Seek(const Slice& target) override;
+ virtual void Next() override;
+ virtual Slice key() const override;
+ virtual Slice value() const override;
+ virtual Status status() const override;
+ virtual Status GetProperty(std::string prop_name, std::string* prop) override;
+ virtual void SetPinnedItersMgr(
+ PinnedIteratorsManager* pinned_iters_mgr) override;
+ virtual bool IsKeyPinned() const override;
+ virtual bool IsValuePinned() const override;
+
+ bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
+
+ private:
+ void Cleanup(bool release_sv);
+ // Unreference and, if needed, clean up the current SuperVersion. This is
+ // either done immediately or deferred until this iterator is unpinned by
+ // PinnedIteratorsManager.
+ void SVCleanup();
+ static void SVCleanup(
+ DBImpl* db, SuperVersion* sv, bool background_purge_on_iterator_cleanup);
+ static void DeferredSVCleanup(void* arg);
+
+ void RebuildIterators(bool refresh_sv);
+ void RenewIterators();
+ void BuildLevelIterators(const VersionStorageInfo* vstorage);
+ void ResetIncompleteIterators();
+ void SeekInternal(const Slice& internal_key, bool seek_to_first);
+ void UpdateCurrent();
+ bool NeedToSeekImmutable(const Slice& internal_key);
+ void DeleteCurrentIter();
+ uint32_t FindFileInRange(
+ const std::vector<FileMetaData*>& files, const Slice& internal_key,
+ uint32_t left, uint32_t right);
+
+ bool IsOverUpperBound(const Slice& internal_key) const;
+
+  // Set PinnedIteratorsManager for all child iterators. This function should
+  // be called whenever we update the child iterators or pinned_iters_mgr_.
+ void UpdateChildrenPinnedItersMgr();
+
+ // A helper function that will release iter in the proper manner, or pass it
+ // to pinned_iters_mgr_ to release it later if pinning is enabled.
+ void DeleteIterator(InternalIterator* iter, bool is_arena = false);
+
+ DBImpl* const db_;
+ const ReadOptions read_options_;
+ ColumnFamilyData* const cfd_;
+ const SliceTransform* const prefix_extractor_;
+ const Comparator* user_comparator_;
+ MinIterHeap immutable_min_heap_;
+
+ SuperVersion* sv_;
+ InternalIterator* mutable_iter_;
+ std::vector<InternalIterator*> imm_iters_;
+ std::vector<InternalIterator*> l0_iters_;
+ std::vector<ForwardLevelIterator*> level_iters_;
+ InternalIterator* current_;
+ bool valid_;
+
+ // Internal iterator status; set only by one of the unsupported methods.
+ Status status_;
+ // Status of immutable iterators, maintained here to avoid iterating over
+ // all of them in status().
+ Status immutable_status_;
+ // Indicates that at least one of the immutable iterators pointed to a key
+ // larger than iterate_upper_bound and was therefore destroyed. Seek() may
+ // need to rebuild such iterators.
+ bool has_iter_trimmed_for_upper_bound_;
+ // Is current key larger than iterate_upper_bound? If so, makes Valid()
+ // return false.
+ bool current_over_upper_bound_;
+
+ // Left endpoint of the range of keys that immutable iterators currently
+ // cover. When Seek() is called with a key that's within that range, immutable
+ // iterators don't need to be moved; see NeedToSeekImmutable(). This key is
+ // included in the range after a Seek(), but excluded when advancing the
+ // iterator using Next().
+ IterKey prev_key_;
+ bool is_prev_set_;
+ bool is_prev_inclusive_;
+
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ Arena arena_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator_bench.cc b/src/rocksdb/db/forward_iterator_bench.cc
new file mode 100644
index 000000000..6f1223537
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator_bench.cc
@@ -0,0 +1,377 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#elif defined(OS_MACOSX) || defined(OS_WIN)
+// Block forward_iterator_bench under macOS and Windows
+int main() { return 0; }
+#else
+#include <semaphore.h>
+#include <atomic>
+#include <bitset>
+#include <chrono>
+#include <climits>
+#include <condition_variable>
+#include <limits>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <thread>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/gflags_compat.h"
+
+const int MAX_SHARDS = 100000;
+
+DEFINE_int32(writers, 8, "");
+DEFINE_int32(readers, 8, "");
+DEFINE_int64(rate, 100000, "");
+DEFINE_int64(value_size, 300, "");
+DEFINE_int64(shards, 1000, "");
+DEFINE_int64(memtable_size, 500000000, "");
+DEFINE_int64(block_cache_size, 300000000, "");
+DEFINE_int64(block_size, 65536, "");
+DEFINE_double(runtime, 300.0, "");
+DEFINE_bool(cache_only_first, true, "");
+DEFINE_bool(iterate_upper_bound, true, "");
+
+struct Stats {
+ char pad1[128] __attribute__((__unused__));
+ std::atomic<uint64_t> written{0};
+ char pad2[128] __attribute__((__unused__));
+ std::atomic<uint64_t> read{0};
+ std::atomic<uint64_t> cache_misses{0};
+ char pad3[128] __attribute__((__unused__));
+} stats;
+
+struct Key {
+ Key() {}
+ Key(uint64_t shard_in, uint64_t seqno_in)
+ : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}
+
+ uint64_t shard() const { return be64toh(shard_be); }
+ uint64_t seqno() const { return be64toh(seqno_be); }
+
+ private:
+ uint64_t shard_be;
+ uint64_t seqno_be;
+} __attribute__((__packed__));
+
+struct Reader;
+struct Writer;
+
+struct ShardState {
+ char pad1[128] __attribute__((__unused__));
+ std::atomic<uint64_t> last_written{0};
+ Writer* writer;
+ Reader* reader;
+ char pad2[128] __attribute__((__unused__));
+ std::atomic<uint64_t> last_read{0};
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it;
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it_cacheonly;
+ Key upper_bound;
+ ROCKSDB_NAMESPACE::Slice upper_bound_slice;
+ char pad3[128] __attribute__((__unused__));
+};
+
+struct Reader {
+ public:
+ explicit Reader(std::vector<ShardState>* shard_states,
+ ROCKSDB_NAMESPACE::DB* db)
+ : shard_states_(shard_states), db_(db) {
+ sem_init(&sem_, 0, 0);
+ thread_ = port::Thread(&Reader::run, this);
+ }
+
+ void run() {
+ while (1) {
+ sem_wait(&sem_);
+ if (done_.load()) {
+ break;
+ }
+
+ uint64_t shard;
+ {
+ std::lock_guard<std::mutex> guard(queue_mutex_);
+ assert(!shards_pending_queue_.empty());
+ shard = shards_pending_queue_.front();
+ shards_pending_queue_.pop();
+ shards_pending_set_.reset(shard);
+ }
+ readOnceFromShard(shard);
+ }
+ }
+
+ void readOnceFromShard(uint64_t shard) {
+ ShardState& state = (*shard_states_)[shard];
+ if (!state.it) {
+ // Initialize iterators
+ ROCKSDB_NAMESPACE::ReadOptions options;
+ options.tailing = true;
+ if (FLAGS_iterate_upper_bound) {
+ state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
+ state.upper_bound_slice = ROCKSDB_NAMESPACE::Slice(
+ (const char*)&state.upper_bound, sizeof(state.upper_bound));
+ options.iterate_upper_bound = &state.upper_bound_slice;
+ }
+
+ state.it.reset(db_->NewIterator(options));
+
+ if (FLAGS_cache_only_first) {
+ options.read_tier = ROCKSDB_NAMESPACE::ReadTier::kBlockCacheTier;
+ state.it_cacheonly.reset(db_->NewIterator(options));
+ }
+ }
+
+ const uint64_t upto = state.last_written.load();
+ for (ROCKSDB_NAMESPACE::Iterator* it :
+ {state.it_cacheonly.get(), state.it.get()}) {
+ if (it == nullptr) {
+ continue;
+ }
+ if (state.last_read.load() >= upto) {
+ break;
+ }
+ bool need_seek = true;
+ for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) {
+ if (need_seek) {
+ Key from(shard, state.last_read.load() + 1);
+ it->Seek(ROCKSDB_NAMESPACE::Slice((const char*)&from, sizeof(from)));
+ need_seek = false;
+ } else {
+ it->Next();
+ }
+ if (it->status().IsIncomplete()) {
+ ++::stats.cache_misses;
+ break;
+ }
+ assert(it->Valid());
+ assert(it->key().size() == sizeof(Key));
+ Key key;
+ memcpy(&key, it->key().data(), it->key().size());
+ // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
+ // shard, seq, key.shard(), key.seqno());
+ assert(key.shard() == shard);
+ assert(key.seqno() == seq);
+ state.last_read.store(seq);
+ ++::stats.read;
+ }
+ }
+ }
+
+ void onWrite(uint64_t shard) {
+ {
+ std::lock_guard<std::mutex> guard(queue_mutex_);
+ if (!shards_pending_set_.test(shard)) {
+ shards_pending_queue_.push(shard);
+ shards_pending_set_.set(shard);
+ sem_post(&sem_);
+ }
+ }
+ }
+
+ ~Reader() {
+ done_.store(true);
+ sem_post(&sem_);
+ thread_.join();
+ }
+
+ private:
+ char pad1[128] __attribute__((__unused__));
+ std::vector<ShardState>* shard_states_;
+ ROCKSDB_NAMESPACE::DB* db_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ sem_t sem_;
+ std::mutex queue_mutex_;
+ std::bitset<MAX_SHARDS + 1> shards_pending_set_;
+ std::queue<uint64_t> shards_pending_queue_;
+ std::atomic<bool> done_{false};
+ char pad2[128] __attribute__((__unused__));
+};
+
+struct Writer {
+ explicit Writer(std::vector<ShardState>* shard_states,
+ ROCKSDB_NAMESPACE::DB* db)
+ : shard_states_(shard_states), db_(db) {}
+
+ void start() { thread_ = port::Thread(&Writer::run, this); }
+
+ void run() {
+ std::queue<std::chrono::steady_clock::time_point> workq;
+ std::chrono::steady_clock::time_point deadline(
+ std::chrono::steady_clock::now() +
+ std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
+ std::vector<uint64_t> my_shards;
+ for (int i = 1; i <= FLAGS_shards; ++i) {
+ if ((*shard_states_)[i].writer == this) {
+ my_shards.push_back(i);
+ }
+ }
+
+ std::mt19937 rng{std::random_device()()};
+ std::uniform_int_distribution<int> shard_dist(
+ 0, static_cast<int>(my_shards.size()) - 1);
+ std::string value(FLAGS_value_size, '*');
+
+ while (1) {
+ auto now = std::chrono::steady_clock::now();
+ if (FLAGS_runtime >= 0 && now >= deadline) {
+ break;
+ }
+ if (workq.empty()) {
+ for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
+ std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
+ workq.push(now + offset);
+ }
+ }
+ while (!workq.empty() && workq.front() < now) {
+ workq.pop();
+ uint64_t shard = my_shards[shard_dist(rng)];
+ ShardState& state = (*shard_states_)[shard];
+ uint64_t seqno = state.last_written.load() + 1;
+ Key key(shard, seqno);
+ // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
+ ROCKSDB_NAMESPACE::Status status =
+ db_->Put(ROCKSDB_NAMESPACE::WriteOptions(),
+ ROCKSDB_NAMESPACE::Slice((const char*)&key, sizeof(key)),
+ ROCKSDB_NAMESPACE::Slice(value));
+ assert(status.ok());
+ state.last_written.store(seqno);
+ state.reader->onWrite(shard);
+ ++::stats.written;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ // fprintf(stderr, "Writer done\n");
+ }
+
+ ~Writer() { thread_.join(); }
+
+ private:
+ char pad1[128] __attribute__((__unused__));
+ std::vector<ShardState>* shard_states_;
+ ROCKSDB_NAMESPACE::DB* db_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ char pad2[128] __attribute__((__unused__));
+};
+
+struct StatsThread {
+ explicit StatsThread(ROCKSDB_NAMESPACE::DB* db)
+ : db_(db), thread_(&StatsThread::run, this) {}
+
+ void run() {
+ // using namespace std::chrono;
+ auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
+ uint64_t wlast = 0, rlast = 0;
+ while (!done_.load()) {
+ {
+ std::unique_lock<std::mutex> lock(cvm_);
+ cv_.wait_for(lock, std::chrono::seconds(1));
+ }
+ auto now = std::chrono::steady_clock::now();
+ double elapsed =
+ std::chrono::duration_cast<std::chrono::duration<double> >(
+ now - tlast).count();
+ uint64_t w = ::stats.written.load();
+ uint64_t r = ::stats.read.load();
+ fprintf(stderr,
+ "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | "
+ "r/s %10.0f | cache misses %10ld\n",
+ db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
+ std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
+ .count(),
+ w, (w - wlast) / elapsed, r, (r - rlast) / elapsed,
+ ::stats.cache_misses.load());
+ wlast = w;
+ rlast = r;
+ tlast = now;
+ }
+ }
+
+ ~StatsThread() {
+ {
+ std::lock_guard<std::mutex> guard(cvm_);
+ done_.store(true);
+ }
+ cv_.notify_all();
+ thread_.join();
+ }
+
+ private:
+ ROCKSDB_NAMESPACE::DB* db_;
+ std::mutex cvm_;
+ std::condition_variable cv_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ std::atomic<bool> done_{false};
+};
+
+int main(int argc, char** argv) {
+ GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+ std::mt19937 rng{std::random_device()()};
+ ROCKSDB_NAMESPACE::Status status;
+ std::string path =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("forward_iterator_test");
+ fprintf(stderr, "db path is %s\n", path.c_str());
+ ROCKSDB_NAMESPACE::Options options;
+ options.create_if_missing = true;
+ options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+ options.compaction_style =
+ ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 99999;
+ options.level0_stop_writes_trigger = 99999;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.write_buffer_size = FLAGS_memtable_size;
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
+ table_options.block_cache =
+ ROCKSDB_NAMESPACE::NewLRUCache(FLAGS_block_cache_size);
+ table_options.block_size = FLAGS_block_size;
+ options.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));
+
+ status = ROCKSDB_NAMESPACE::DestroyDB(path, options);
+ assert(status.ok());
+ ROCKSDB_NAMESPACE::DB* db_raw;
+ status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw);
+ assert(status.ok());
+ std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(db_raw);
+
+ std::vector<ShardState> shard_states(FLAGS_shards + 1);
+ std::deque<Reader> readers;
+ while (static_cast<int>(readers.size()) < FLAGS_readers) {
+ readers.emplace_back(&shard_states, db_raw);
+ }
+ std::deque<Writer> writers;
+ while (static_cast<int>(writers.size()) < FLAGS_writers) {
+ writers.emplace_back(&shard_states, db_raw);
+ }
+
+ // Each shard gets a random reader and random writer assigned to it
+ for (int i = 1; i <= FLAGS_shards; ++i) {
+ std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
+ std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
+ shard_states[i].reader = &readers[reader_dist(rng)];
+ shard_states[i].writer = &writers[writer_dist(rng)];
+ }
+
+ StatsThread stats_thread(db_raw);
+ for (Writer& w : writers) {
+ w.start();
+ }
+
+ writers.clear();
+ readers.clear();
+}
+#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/db/import_column_family_job.cc b/src/rocksdb/db/import_column_family_job.cc
new file mode 100644
index 000000000..15af1cf80
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.cc
@@ -0,0 +1,276 @@
+#ifndef ROCKSDB_LITE
+
+#include "db/import_column_family_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
+ SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are importing
+ for (const auto& file_metadata : metadata_) {
+ const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
+ IngestedFileInfo file_to_import;
+ status = GetIngestedFileInfo(file_path, &file_to_import, sv);
+ if (!status.ok()) {
+ return status;
+ }
+ files_to_import_.push_back(file_to_import);
+ }
+
+ const auto ucmp = cfd_->internal_comparator().user_comparator();
+ auto num_files = files_to_import_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+    // Verify that the passed files don't have overlapping ranges in any
+    // particular level.
+ int min_level = 1; // Check for overlaps in Level 1 and above.
+ int max_level = -1;
+ for (const auto& file_metadata : metadata_) {
+ if (file_metadata.level > max_level) {
+ max_level = file_metadata.level;
+ }
+ }
+ for (int level = min_level; level <= max_level; ++level) {
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ if (metadata_[i].level == level) {
+ sorted_files.push_back(&files_to_import_[i]);
+ }
+ }
+
+ std::sort(sorted_files.begin(), sorted_files.end(),
+ [&ucmp](const IngestedFileInfo* info1,
+ const IngestedFileInfo* info2) {
+ return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i < sorted_files.size() - 1; i++) {
+ if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >=
+ 0) {
+ return Status::InvalidArgument("Files have overlapping ranges");
+ }
+ }
+ }
+ }
+
+ for (const auto& f : files_to_import_) {
+ if (f.num_entries == 0) {
+ return Status::InvalidArgument("File contain no entries");
+ }
+
+ if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+ return Status::Corruption("File has corrupted keys");
+ }
+ }
+
+ // Copy/Move external files into DB
+ auto hardlink_files = import_options_.move_files;
+ for (auto& f : files_to_import_) {
+ f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
+
+ const auto path_outside_db = f.external_file_path;
+ const auto path_inside_db = TableFileName(
+ cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+ if (hardlink_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.IsNotSupported()) {
+ // Original file is on a different FS, use copy instead of hard linking
+ hardlink_files = false;
+ }
+ }
+ if (!hardlink_files) {
+ status = CopyFile(fs_, path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync);
+ }
+ if (!status.ok()) {
+ break;
+ }
+ f.copy_file = !hardlink_files;
+ f.internal_file_path = path_inside_db;
+ }
+
+ if (!status.ok()) {
+    // We failed; remove all the files that we copied into the db.
+ for (const auto& f : files_to_import_) {
+ if (f.internal_file_path.empty()) {
+ break;
+ }
+ const auto s =
+ fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+
+ return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+ Status status;
+ edit_.SetColumnFamily(cfd_->GetID());
+
+  // We use the import time as the ancestor time. This is the time the data
+  // is written to the database.
+ int64_t temp_current_time = 0;
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+ uint64_t current_time = kUnknownOldestAncesterTime;
+ if (env_->GetCurrentTime(&temp_current_time).ok()) {
+ current_time = oldest_ancester_time =
+ static_cast<uint64_t>(temp_current_time);
+ }
+
+ for (size_t i = 0; i < files_to_import_.size(); ++i) {
+ const auto& f = files_to_import_[i];
+ const auto& file_metadata = metadata_[i];
+
+ edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+ f.fd.GetFileSize(), f.smallest_internal_key,
+ f.largest_internal_key, file_metadata.smallest_seqno,
+ file_metadata.largest_seqno, false, kInvalidBlobFileNumber,
+ oldest_ancester_time, current_time, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+
+ // If incoming sequence number is higher, update local sequence number.
+ if (file_metadata.largest_seqno > versions_->LastSequence()) {
+ versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+ versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+ versions_->SetLastSequence(file_metadata.largest_seqno);
+ }
+ }
+
+ return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+ if (!status.ok()) {
+    // We failed to add files to the database; remove all the files we copied.
+ for (const auto& f : files_to_import_) {
+ const auto s =
+ fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ } else if (status.ok() && import_options_.move_files) {
+ // The files were moved and added successfully, remove original file links
+ for (IngestedFileInfo& f : files_to_import_) {
+ const auto s =
+ fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "%s was added to DB successfully but failed to remove original "
+ "file link : %s",
+ f.external_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+}
+
+Status ImportColumnFamilyJob::GetIngestedFileInfo(
+ const std::string& external_file, IngestedFileInfo* file_to_import,
+ SuperVersion* sv) {
+ file_to_import->external_file_path = external_file;
+
+ // Get external file size
+ Status status = fs_->GetFileSize(external_file, IOOptions(),
+ &file_to_import->file_size, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create TableReader for external file
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<FSRandomAccessFile> sst_file;
+ std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+ status = fs_->NewRandomAccessFile(external_file, env_options_,
+ &sst_file, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+ sst_file_reader.reset(
+ new RandomAccessFileReader(std::move(sst_file), external_file));
+
+ status = cfd_->ioptions()->table_factory->NewTableReader(
+ TableReaderOptions(*cfd_->ioptions(),
+ sv->mutable_cf_options.prefix_extractor.get(),
+ env_options_, cfd_->internal_comparator()),
+ std::move(sst_file_reader), file_to_import->file_size, &table_reader);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Get the external file properties
+ auto props = table_reader->GetTableProperties();
+
+ // Set original_seqno to 0.
+ file_to_import->original_seqno = 0;
+
+ // Get number of entries in table
+ file_to_import->num_entries = props->num_entries;
+
+ ParsedInternalKey key;
+ ReadOptions ro;
+  // While reading the external file we could cache the blocks we read in the
+  // block cache. If we later change the global seqno of this file, the cache
+  // would contain blocks whose keys carry the wrong seqno. We therefore
+  // disable fill_cache so that we read from the file without updating the
+  // block cache.
+ ro.fill_cache = false;
+ std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+ ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+ // Get first (smallest) key from file
+ iter->SeekToFirst();
+ if (!ParseInternalKey(iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ file_to_import->smallest_internal_key.SetFrom(key);
+
+ // Get last (largest) key from file
+ iter->SeekToLast();
+ if (!ParseInternalKey(iter->key(), &key)) {
+ return Status::Corruption("external file have corrupted keys");
+ }
+ file_to_import->largest_internal_key.SetFrom(key);
+
+ file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+ file_to_import->table_properties = *props;
+
+ return status;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/import_column_family_job.h b/src/rocksdb/db/import_column_family_job.h
new file mode 100644
index 000000000..160fd1247
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.h
@@ -0,0 +1,72 @@
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
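+//
+// Typical call path (illustrative): DB::CreateColumnFamilyWithImport() builds
+// this job from an ExportImportFilesMetaData, then invokes Prepare(), Run()
+// (with the DB mutex held, as required below) and finally Cleanup(). See
+// import_column_family_test.cc for an end-to-end example.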
+class ImportColumnFamilyJob {
+ public:
+ ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const EnvOptions& env_options,
+ const ImportColumnFamilyOptions& import_options,
+ const std::vector<LiveFileMetaData>& metadata)
+ : env_(env),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ fs_(db_options_.fs.get()),
+ env_options_(env_options),
+ import_options_(import_options),
+ metadata_(metadata) {}
+
+ // Prepare the job by copying external files into the DB.
+ Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+ // Will execute the import job and prepare edit() to be applied.
+ // REQUIRES: Mutex held
+ Status Run();
+
+ // Cleanup after successful/failed job
+ void Cleanup(const Status& status);
+
+ VersionEdit* edit() { return &edit_; }
+
+ const autovector<IngestedFileInfo>& files_to_import() const {
+ return files_to_import_;
+ }
+
+ private:
+ // Open the external file and populate `file_to_import` with all the
+ // external information we need to import this file.
+ Status GetIngestedFileInfo(const std::string& external_file,
+ IngestedFileInfo* file_to_import,
+ SuperVersion* sv);
+
+ Env* env_;
+ VersionSet* versions_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ FileSystem* fs_;
+ const EnvOptions& env_options_;
+ autovector<IngestedFileInfo> files_to_import_;
+ VersionEdit edit_;
+ const ImportColumnFamilyOptions& import_options_;
+ std::vector<LiveFileMetaData> metadata_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_test.cc b/src/rocksdb/db/import_column_family_test.cc
new file mode 100644
index 000000000..a25560b7c
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_test.cc
@@ -0,0 +1,567 @@
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ImportColumnFamilyTest : public DBTestBase {
+ public:
+ ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ DestroyAndRecreateExternalSSTFilesDir();
+ export_files_dir_ = test::TmpDir(env_) + "/export";
+ import_cfh_ = nullptr;
+ import_cfh2_ = nullptr;
+ metadata_ptr_ = nullptr;
+ }
+
+ ~ImportColumnFamilyTest() {
+ if (import_cfh_) {
+ db_->DropColumnFamily(import_cfh_);
+ db_->DestroyColumnFamilyHandle(import_cfh_);
+ import_cfh_ = nullptr;
+ }
+ if (import_cfh2_) {
+ db_->DropColumnFamily(import_cfh2_);
+ db_->DestroyColumnFamilyHandle(import_cfh2_);
+ import_cfh2_ = nullptr;
+ }
+ if (metadata_ptr_) {
+ delete metadata_ptr_;
+ metadata_ptr_ = nullptr;
+ }
+ test::DestroyDir(env_, sst_files_dir_);
+ test::DestroyDir(env_, export_files_dir_);
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ test::DestroyDir(env_, sst_files_dir_);
+ env_->CreateDir(sst_files_dir_);
+ test::DestroyDir(env_, export_files_dir_);
+ }
+
+ LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path,
+ int level,
+ SequenceNumber smallest_seqno,
+ SequenceNumber largest_seqno) {
+ LiveFileMetaData metadata;
+ metadata.name = name;
+ metadata.db_path = path;
+ metadata.smallest_seqno = smallest_seqno;
+ metadata.largest_seqno = largest_seqno;
+ metadata.level = level;
+ return metadata;
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ std::string export_files_dir_;
+ ColumnFamilyHandle* import_cfh_;
+ ColumnFamilyHandle* import_cfh2_;
+ ExportImportFilesMetaData* metadata_ptr_;
+};
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ SstFileWriter sfw_unknown(EnvOptions(), options);
+
+ // cf1.sst
+ const std::string cf1_sst_name = "cf1.sst";
+ const std::string cf1_sst = sst_files_dir_ + cf1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(cf1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // cf_unknown.sst
+ const std::string unknown_sst_name = "cf_unknown.sst";
+ const std::string unknown_sst = sst_files_dir_ + unknown_sst_name;
+ ASSERT_OK(sfw_unknown.Open(unknown_sst));
+ ASSERT_OK(sfw_unknown.Put("K3", "V1"));
+ ASSERT_OK(sfw_unknown.Put("K4", "V2"));
+ ASSERT_OK(sfw_unknown.Finish());
+
+ {
+ // Import sst file corresponding to cf1 onto a new cf and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, "K1", &value);
+ ASSERT_EQ(value, "V1");
+ db_->Get(ReadOptions(), import_cfh_, "K2", &value);
+ ASSERT_EQ(value, "V2");
+ ASSERT_OK(db_->DropColumnFamily(import_cfh_));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+ import_cfh_ = nullptr;
+ }
+
+ {
+ // Import sst file corresponding to unknown cf onto a new cf and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, "K3", &value);
+ ASSERT_EQ(value, "V1");
+ db_->Get(ReadOptions(), import_cfh_, "K4", &value);
+ ASSERT_EQ(value, "V2");
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+
+ // file3.sst
+ const std::string file3_sst_name = "file3.sst";
+ const std::string file3_sst = sst_files_dir_ + file3_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file3_sst));
+ for (int i = 0; i < 100; ++i) {
+ sfw_cf1.Put(Key(i), Key(i) + "_val");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file2.sst
+ const std::string file2_sst_name = "file2.sst";
+ const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file2_sst));
+ for (int i = 0; i < 100; i += 2) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite1");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file1a.sst
+ const std::string file1a_sst_name = "file1a.sst";
+ const std::string file1a_sst = sst_files_dir_ + file1a_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1a_sst));
+ for (int i = 0; i < 52; i += 4) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite2");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file1b.sst
+ const std::string file1b_sst_name = "file1b.sst";
+ const std::string file1b_sst = sst_files_dir_ + file1b_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1b_sst));
+ for (int i = 52; i < 100; i += 4) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite2");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file0a.sst
+ const std::string file0a_sst_name = "file0a.sst";
+ const std::string file0a_sst = sst_files_dir_ + file0a_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file0a_sst));
+ for (int i = 0; i < 100; i += 16) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite3");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file0b.sst
+ const std::string file0b_sst_name = "file0b.sst";
+ const std::string file0b_sst = sst_files_dir_ + file0b_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file0b_sst));
+ for (int i = 0; i < 100; i += 16) {
+ sfw_cf1.Put(Key(i), Key(i) + "_overwrite4");
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // Import sst files and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
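+ // Lookups should return the value from the lowest level that contains the
+ // key; within L0, the file with the higher sequence numbers (file0b) wins.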
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value);
+ if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+
+ for (int i = 0; i < 100; i += 5) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5"));
+ }
+
+ // Flush and check again
+ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value);
+ if (i % 5 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite5");
+ } else if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+
+ // Compact and check again.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value);
+ if (i % 5 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite5");
+ } else if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ for (int i = 0; i < 100; ++i) {
+ Put(1, Key(i), Key(i) + "_val");
+ }
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+ // Overwrite the values of the same set of keys.
+ for (int i = 0; i < 100; ++i) {
+ Put(1, Key(i), Key(i) + "_overwrite");
+ }
+
+ // Flush to create L0 file.
+ ASSERT_OK(Flush(1));
+ for (int i = 0; i < 100; ++i) {
+ Put(1, Key(i), Key(i) + "_overwrite2");
+ }
+
+ // Flush again to create another L0 file. It should have higher sequence numbers.
+ ASSERT_OK(Flush(1));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
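+ // Import the exported files twice: first with move_files = false so the
+ // exported copies stay in place, then with move_files = true to consume them.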
+ ImportColumnFamilyOptions import_options;
+ import_options.move_files = false;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options,
+ *metadata_ptr_, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ import_options.move_files = true;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options,
+ *metadata_ptr_, &import_cfh2_));
+ ASSERT_NE(import_cfh2_, nullptr);
+ delete metadata_ptr_;
+ metadata_ptr_ = NULL;
+
+ std::string value1, value2;
+
+ for (int i = 0; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Get(1, Key(i)), value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2);
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+
+ // Modify keys in cf1 and verify.
+ for (int i = 0; i < 25; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i)));
+ }
+ for (int i = 25; i < 50; i++) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3"));
+ }
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+ }
+ for (int i = 25; i < 50; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Key(i) + "_overwrite3", value1);
+ }
+ for (int i = 50; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Key(i) + "_overwrite2", value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2);
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+
+ // Compact and check again.
+ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+ }
+ for (int i = 25; i < 50; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Key(i) + "_overwrite3", value1);
+ }
+ for (int i = 50; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1);
+ ASSERT_EQ(Key(i) + "_overwrite2", value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2);
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ for (int i = 0; i < 100; ++i) {
+ Put(1, Key(i), Key(i) + "_val");
+ }
+ ASSERT_OK(Flush(1));
+
+ // Compact to create an L1 file.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+ // Overwrite the values of the same set of keys.
+ for (int i = 0; i < 50; ++i) {
+ Put(1, Key(i), Key(i) + "_overwrite");
+ }
+
+ // Flush to create L0 file.
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < 25; ++i) {
+ Put(1, Key(i), Key(i) + "_overwrite2");
+ }
+
+ // Flush again to create another L0 file. It should have higher sequence numbers.
+ ASSERT_OK(Flush(1));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
+ // Create a new db and import the files.
+ DB* db_copy;
+ test::DestroyDir(env_, dbname_ + "/db_copy");
+ ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ *metadata_ptr_, &cfh));
+ ASSERT_NE(cfh, nullptr);
+
+ for (int i = 0; i < 100; ++i) {
+ std::string value;
+ db_copy->Get(ReadOptions(), cfh, Key(i), &value);
+ ASSERT_EQ(Get(1, Key(i)), value);
+ }
+ db_copy->DropColumnFamily(cfh);
+ db_copy->DestroyColumnFamilyHandle(cfh);
+ delete db_copy;
+ test::DestroyDir(env_, dbname_ + "/db_copy");
+}
+
+TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ {
+ // Create column family with existing cf name.
+ ExportImportFilesMetaData metadata;
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Column family already exists"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Import with no files specified.
+ ExportImportFilesMetaData metadata;
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("The list of files is empty"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Import with overlapping keys in sst files.
+ ExportImportFilesMetaData metadata;
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+ const std::string file2_sst_name = "file2.sst";
+ const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file2_sst));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Put("K3", "V3"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Files have overlapping ranges"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Importing with a mismatching comparator should fail with an appropriate
+ // error.
+ ExportImportFilesMetaData metadata;
+ Options mismatch_options = CurrentOptions();
+ mismatch_options.comparator = ReverseBytewiseComparator();
+ SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = mismatch_options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Comparator name mismatch"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Importing a non-existent sst file should fail with an appropriate error.
+ ExportImportFilesMetaData metadata;
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+ const std::string file3_sst_name = "file3.sst";
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::IOError("No such file or directory"));
+ ASSERT_EQ(import_cfh_, nullptr);
+
+ // Test a successful import after a failure with the same CF name. Ensures
+ // that a failed import leaves no side effects on the CF.
+ metadata.files.pop_back();
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as External SST File Writer and Import are not supported "
+ "in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc
new file mode 100644
index 000000000..f729ee7c7
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.cc
@@ -0,0 +1,1424 @@
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/internal_stats.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
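+// Each LevelStat pairs the key used in the map-style output (property_name)
+// with the column header shown in the text table (header_name).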
+const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
+ {
+ {LevelStatType::NUM_FILES, LevelStat{"NumFiles", "Files"}},
+ {LevelStatType::COMPACTED_FILES,
+ LevelStat{"CompactedFiles", "CompactedFiles"}},
+ {LevelStatType::SIZE_BYTES, LevelStat{"SizeBytes", "Size"}},
+ {LevelStatType::SCORE, LevelStat{"Score", "Score"}},
+ {LevelStatType::READ_GB, LevelStat{"ReadGB", "Read(GB)"}},
+ {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}},
+ {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}},
+ {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}},
+ {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}},
+ {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}},
+ {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}},
+ {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}},
+ {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}},
+ {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}},
+ {LevelStatType::COMP_CPU_SEC,
+ LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}},
+ {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}},
+ {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}},
+ {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}},
+ {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}},
+};
+
+namespace {
+const double kMB = 1048576.0;
+const double kGB = kMB * 1024;
+const double kMicrosInSec = 1000000.0;
+
+void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
+ const std::string& group_by) {
+ int written_size =
+ snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str());
+ auto hdr = [](LevelStatType t) {
+ return InternalStats::compaction_level_stats.at(t).header_name.c_str();
+ };
+ int line_size = snprintf(
+ buf + written_size, len - written_size,
+ "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
+ // Note that we skip COMPACTED_FILES and merge it into the Files column
+ group_by.c_str(), hdr(LevelStatType::NUM_FILES),
+ hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE),
+ hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB),
+ hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB),
+ hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB),
+ hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS),
+ hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC),
+ hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT),
+ hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN),
+ hdr(LevelStatType::KEY_DROP));
+
+ written_size += line_size;
+ snprintf(buf + written_size, len - written_size, "%s\n",
+ std::string(line_size, '-').c_str());
+}
+
+void PrepareLevelStats(std::map<LevelStatType, double>* level_stats,
+ int num_files, int being_compacted,
+ double total_file_size, double score, double w_amp,
+ const InternalStats::CompactionStats& stats) {
+ uint64_t bytes_read =
+ stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+ int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level;
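+ // Add one microsecond so the rate computations below never divide by zero.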
+ double elapsed = (stats.micros + 1) / kMicrosInSec;
+
+ (*level_stats)[LevelStatType::NUM_FILES] = num_files;
+ (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted;
+ (*level_stats)[LevelStatType::SIZE_BYTES] = total_file_size;
+ (*level_stats)[LevelStatType::SCORE] = score;
+ (*level_stats)[LevelStatType::READ_GB] = bytes_read / kGB;
+ (*level_stats)[LevelStatType::RN_GB] =
+ stats.bytes_read_non_output_levels / kGB;
+ (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB;
+ (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB;
+ (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB;
+ (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB;
+ (*level_stats)[LevelStatType::WRITE_AMP] = w_amp;
+ (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed;
+ (*level_stats)[LevelStatType::WRITE_MBPS] =
+ stats.bytes_written / kMB / elapsed;
+ (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec;
+ (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec;
+ (*level_stats)[LevelStatType::COMP_COUNT] = stats.count;
+ (*level_stats)[LevelStatType::AVG_SEC] =
+ stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count;
+ (*level_stats)[LevelStatType::KEY_IN] =
+ static_cast<double>(stats.num_input_records);
+ (*level_stats)[LevelStatType::KEY_DROP] =
+ static_cast<double>(stats.num_dropped_records);
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+ const std::map<LevelStatType, double>& stat_value) {
+ snprintf(
+ buf, len,
+ "%4s " /* Level */
+ "%6d/%-3d " /* Files */
+ "%8s " /* Size */
+ "%5.1f " /* Score */
+ "%8.1f " /* Read(GB) */
+ "%7.1f " /* Rn(GB) */
+ "%8.1f " /* Rnp1(GB) */
+ "%9.1f " /* Write(GB) */
+ "%8.1f " /* Wnew(GB) */
+ "%9.1f " /* Moved(GB) */
+ "%5.1f " /* W-Amp */
+ "%8.1f " /* Rd(MB/s) */
+ "%8.1f " /* Wr(MB/s) */
+ "%9.2f " /* Comp(sec) */
+ "%17.2f " /* CompMergeCPU(sec) */
+ "%9d " /* Comp(cnt) */
+ "%8.3f " /* Avg(sec) */
+ "%7s " /* KeyIn */
+ "%6s\n", /* KeyDrop */
+ name.c_str(), static_cast<int>(stat_value.at(LevelStatType::NUM_FILES)),
+ static_cast<int>(stat_value.at(LevelStatType::COMPACTED_FILES)),
+ BytesToHumanString(
+ static_cast<uint64_t>(stat_value.at(LevelStatType::SIZE_BYTES)))
+ .c_str(),
+ stat_value.at(LevelStatType::SCORE),
+ stat_value.at(LevelStatType::READ_GB),
+ stat_value.at(LevelStatType::RN_GB),
+ stat_value.at(LevelStatType::RNP1_GB),
+ stat_value.at(LevelStatType::WRITE_GB),
+ stat_value.at(LevelStatType::W_NEW_GB),
+ stat_value.at(LevelStatType::MOVED_GB),
+ stat_value.at(LevelStatType::WRITE_AMP),
+ stat_value.at(LevelStatType::READ_MBPS),
+ stat_value.at(LevelStatType::WRITE_MBPS),
+ stat_value.at(LevelStatType::COMP_SEC),
+ stat_value.at(LevelStatType::COMP_CPU_SEC),
+ static_cast<int>(stat_value.at(LevelStatType::COMP_COUNT)),
+ stat_value.at(LevelStatType::AVG_SEC),
+ NumberToHumanString(
+ static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_IN)))
+ .c_str(),
+ NumberToHumanString(
+ static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_DROP)))
+ .c_str());
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+ int num_files, int being_compacted, double total_file_size,
+ double score, double w_amp,
+ const InternalStats::CompactionStats& stats) {
+ std::map<LevelStatType, double> level_stats;
+ PrepareLevelStats(&level_stats, num_files, being_compacted, total_file_size,
+ score, w_amp, stats);
+ PrintLevelStats(buf, len, name, level_stats);
+}
+
+// Assumes that trailing numbers represent an optional argument. This requires
+// property names to not end with numbers.
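+// For example, "rocksdb.num-files-at-level2" is split into the name
+// "rocksdb.num-files-at-level" and the argument "2".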
+std::pair<Slice, Slice> GetPropertyNameAndArg(const Slice& property) {
+ Slice name = property, arg = property;
+ size_t sfx_len = 0;
+ while (sfx_len < property.size() &&
+ isdigit(property[property.size() - sfx_len - 1])) {
+ ++sfx_len;
+ }
+ name.remove_suffix(sfx_len);
+ arg.remove_prefix(property.size() - sfx_len);
+ return {name, arg};
+}
+} // anonymous namespace
+
+static const std::string rocksdb_prefix = "rocksdb.";
+
+static const std::string num_files_at_level_prefix = "num-files-at-level";
+static const std::string compression_ratio_at_level_prefix =
+ "compression-ratio-at-level";
+static const std::string allstats = "stats";
+static const std::string sstables = "sstables";
+static const std::string cfstats = "cfstats";
+static const std::string cfstats_no_file_histogram =
+ "cfstats-no-file-histogram";
+static const std::string cf_file_histogram = "cf-file-histogram";
+static const std::string dbstats = "dbstats";
+static const std::string levelstats = "levelstats";
+static const std::string num_immutable_mem_table = "num-immutable-mem-table";
+static const std::string num_immutable_mem_table_flushed =
+ "num-immutable-mem-table-flushed";
+static const std::string mem_table_flush_pending = "mem-table-flush-pending";
+static const std::string compaction_pending = "compaction-pending";
+static const std::string background_errors = "background-errors";
+static const std::string cur_size_active_mem_table =
+ "cur-size-active-mem-table";
+static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables";
+static const std::string size_all_mem_tables = "size-all-mem-tables";
+static const std::string num_entries_active_mem_table =
+ "num-entries-active-mem-table";
+static const std::string num_entries_imm_mem_tables =
+ "num-entries-imm-mem-tables";
+static const std::string num_deletes_active_mem_table =
+ "num-deletes-active-mem-table";
+static const std::string num_deletes_imm_mem_tables =
+ "num-deletes-imm-mem-tables";
+static const std::string estimate_num_keys = "estimate-num-keys";
+static const std::string estimate_table_readers_mem =
+ "estimate-table-readers-mem";
+static const std::string is_file_deletions_enabled =
+ "is-file-deletions-enabled";
+static const std::string num_snapshots = "num-snapshots";
+static const std::string oldest_snapshot_time = "oldest-snapshot-time";
+static const std::string oldest_snapshot_sequence = "oldest-snapshot-sequence";
+static const std::string num_live_versions = "num-live-versions";
+static const std::string current_version_number =
+ "current-super-version-number";
+static const std::string estimate_live_data_size = "estimate-live-data-size";
+static const std::string min_log_number_to_keep_str = "min-log-number-to-keep";
+static const std::string min_obsolete_sst_number_to_keep_str =
+ "min-obsolete-sst-number-to-keep";
+static const std::string base_level_str = "base-level";
+static const std::string total_sst_files_size = "total-sst-files-size";
+static const std::string live_sst_files_size = "live-sst-files-size";
+static const std::string estimate_pending_comp_bytes =
+ "estimate-pending-compaction-bytes";
+static const std::string aggregated_table_properties =
+ "aggregated-table-properties";
+static const std::string aggregated_table_properties_at_level =
+ aggregated_table_properties + "-at-level";
+static const std::string num_running_compactions = "num-running-compactions";
+static const std::string num_running_flushes = "num-running-flushes";
+static const std::string actual_delayed_write_rate =
+ "actual-delayed-write-rate";
+static const std::string is_write_stopped = "is-write-stopped";
+static const std::string estimate_oldest_key_time = "estimate-oldest-key-time";
+static const std::string block_cache_capacity = "block-cache-capacity";
+static const std::string block_cache_usage = "block-cache-usage";
+static const std::string block_cache_pinned_usage = "block-cache-pinned-usage";
+static const std::string options_statistics = "options-statistics";
+
+const std::string DB::Properties::kNumFilesAtLevelPrefix =
+ rocksdb_prefix + num_files_at_level_prefix;
+const std::string DB::Properties::kCompressionRatioAtLevelPrefix =
+ rocksdb_prefix + compression_ratio_at_level_prefix;
+const std::string DB::Properties::kStats = rocksdb_prefix + allstats;
+const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables;
+const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats;
+const std::string DB::Properties::kCFStatsNoFileHistogram =
+ rocksdb_prefix + cfstats_no_file_histogram;
+const std::string DB::Properties::kCFFileHistogram =
+ rocksdb_prefix + cf_file_histogram;
+const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats;
+const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats;
+const std::string DB::Properties::kNumImmutableMemTable =
+ rocksdb_prefix + num_immutable_mem_table;
+const std::string DB::Properties::kNumImmutableMemTableFlushed =
+ rocksdb_prefix + num_immutable_mem_table_flushed;
+const std::string DB::Properties::kMemTableFlushPending =
+ rocksdb_prefix + mem_table_flush_pending;
+const std::string DB::Properties::kCompactionPending =
+ rocksdb_prefix + compaction_pending;
+const std::string DB::Properties::kNumRunningCompactions =
+ rocksdb_prefix + num_running_compactions;
+const std::string DB::Properties::kNumRunningFlushes =
+ rocksdb_prefix + num_running_flushes;
+const std::string DB::Properties::kBackgroundErrors =
+ rocksdb_prefix + background_errors;
+const std::string DB::Properties::kCurSizeActiveMemTable =
+ rocksdb_prefix + cur_size_active_mem_table;
+const std::string DB::Properties::kCurSizeAllMemTables =
+ rocksdb_prefix + cur_size_all_mem_tables;
+const std::string DB::Properties::kSizeAllMemTables =
+ rocksdb_prefix + size_all_mem_tables;
+const std::string DB::Properties::kNumEntriesActiveMemTable =
+ rocksdb_prefix + num_entries_active_mem_table;
+const std::string DB::Properties::kNumEntriesImmMemTables =
+ rocksdb_prefix + num_entries_imm_mem_tables;
+const std::string DB::Properties::kNumDeletesActiveMemTable =
+ rocksdb_prefix + num_deletes_active_mem_table;
+const std::string DB::Properties::kNumDeletesImmMemTables =
+ rocksdb_prefix + num_deletes_imm_mem_tables;
+const std::string DB::Properties::kEstimateNumKeys =
+ rocksdb_prefix + estimate_num_keys;
+const std::string DB::Properties::kEstimateTableReadersMem =
+ rocksdb_prefix + estimate_table_readers_mem;
+const std::string DB::Properties::kIsFileDeletionsEnabled =
+ rocksdb_prefix + is_file_deletions_enabled;
+const std::string DB::Properties::kNumSnapshots =
+ rocksdb_prefix + num_snapshots;
+const std::string DB::Properties::kOldestSnapshotTime =
+ rocksdb_prefix + oldest_snapshot_time;
+const std::string DB::Properties::kOldestSnapshotSequence =
+ rocksdb_prefix + oldest_snapshot_sequence;
+const std::string DB::Properties::kNumLiveVersions =
+ rocksdb_prefix + num_live_versions;
+const std::string DB::Properties::kCurrentSuperVersionNumber =
+ rocksdb_prefix + current_version_number;
+const std::string DB::Properties::kEstimateLiveDataSize =
+ rocksdb_prefix + estimate_live_data_size;
+const std::string DB::Properties::kMinLogNumberToKeep =
+ rocksdb_prefix + min_log_number_to_keep_str;
+const std::string DB::Properties::kMinObsoleteSstNumberToKeep =
+ rocksdb_prefix + min_obsolete_sst_number_to_keep_str;
+const std::string DB::Properties::kTotalSstFilesSize =
+ rocksdb_prefix + total_sst_files_size;
+const std::string DB::Properties::kLiveSstFilesSize =
+ rocksdb_prefix + live_sst_files_size;
+const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str;
+const std::string DB::Properties::kEstimatePendingCompactionBytes =
+ rocksdb_prefix + estimate_pending_comp_bytes;
+const std::string DB::Properties::kAggregatedTableProperties =
+ rocksdb_prefix + aggregated_table_properties;
+const std::string DB::Properties::kAggregatedTablePropertiesAtLevel =
+ rocksdb_prefix + aggregated_table_properties_at_level;
+const std::string DB::Properties::kActualDelayedWriteRate =
+ rocksdb_prefix + actual_delayed_write_rate;
+const std::string DB::Properties::kIsWriteStopped =
+ rocksdb_prefix + is_write_stopped;
+const std::string DB::Properties::kEstimateOldestKeyTime =
+ rocksdb_prefix + estimate_oldest_key_time;
+const std::string DB::Properties::kBlockCacheCapacity =
+ rocksdb_prefix + block_cache_capacity;
+const std::string DB::Properties::kBlockCacheUsage =
+ rocksdb_prefix + block_cache_usage;
+const std::string DB::Properties::kBlockCachePinnedUsage =
+ rocksdb_prefix + block_cache_pinned_usage;
+const std::string DB::Properties::kOptionsStatistics =
+ rocksdb_prefix + options_statistics;
+
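+// Each DBPropertyInfo initializer below lists, in order: need_out_of_mutex,
+// handle_string, handle_int, handle_map, and the DBImpl string handler (see
+// the struct definition in internal_stats.h).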
+const std::unordered_map<std::string, DBPropertyInfo>
+ InternalStats::ppt_name_to_info = {
+ {DB::Properties::kNumFilesAtLevelPrefix,
+ {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCompressionRatioAtLevelPrefix,
+ {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr,
+ nullptr, nullptr}},
+ {DB::Properties::kLevelStats,
+ {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kStats,
+ {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kCFStats,
+ {false, &InternalStats::HandleCFStats, nullptr,
+ &InternalStats::HandleCFMapStats, nullptr}},
+ {DB::Properties::kCFStatsNoFileHistogram,
+ {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCFFileHistogram,
+ {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kDBStats,
+ {false, &InternalStats::HandleDBStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kSSTables,
+ {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}},
+ {DB::Properties::kAggregatedTableProperties,
+ {false, &InternalStats::HandleAggregatedTableProperties, nullptr,
+ nullptr, nullptr}},
+ {DB::Properties::kAggregatedTablePropertiesAtLevel,
+ {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel,
+ nullptr, nullptr, nullptr}},
+ {DB::Properties::kNumImmutableMemTable,
+ {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr,
+ nullptr}},
+ {DB::Properties::kNumImmutableMemTableFlushed,
+ {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed,
+ nullptr, nullptr}},
+ {DB::Properties::kMemTableFlushPending,
+ {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr,
+ nullptr}},
+ {DB::Properties::kCompactionPending,
+ {false, nullptr, &InternalStats::HandleCompactionPending, nullptr,
+ nullptr}},
+ {DB::Properties::kBackgroundErrors,
+ {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr,
+ nullptr}},
+ {DB::Properties::kCurSizeActiveMemTable,
+ {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr,
+ nullptr}},
+ {DB::Properties::kCurSizeAllMemTables,
+ {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kSizeAllMemTables,
+ {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kNumEntriesActiveMemTable,
+ {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable,
+ nullptr, nullptr}},
+ {DB::Properties::kNumEntriesImmMemTables,
+ {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kNumDeletesActiveMemTable,
+ {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable,
+ nullptr, nullptr}},
+ {DB::Properties::kNumDeletesImmMemTables,
+ {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateNumKeys,
+ {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateTableReadersMem,
+ {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr,
+ nullptr}},
+ {DB::Properties::kIsFileDeletionsEnabled,
+ {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr,
+ nullptr}},
+ {DB::Properties::kNumSnapshots,
+ {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr,
+ nullptr}},
+ {DB::Properties::kOldestSnapshotTime,
+ {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr,
+ nullptr}},
+ {DB::Properties::kOldestSnapshotSequence,
+ {false, nullptr, &InternalStats::HandleOldestSnapshotSequence, nullptr,
+ nullptr}},
+ {DB::Properties::kNumLiveVersions,
+ {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr,
+ nullptr}},
+ {DB::Properties::kCurrentSuperVersionNumber,
+ {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber,
+ nullptr, nullptr}},
+ {DB::Properties::kEstimateLiveDataSize,
+ {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr,
+ nullptr}},
+ {DB::Properties::kMinLogNumberToKeep,
+ {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr,
+ nullptr}},
+ {DB::Properties::kMinObsoleteSstNumberToKeep,
+ {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep,
+ nullptr, nullptr}},
+ {DB::Properties::kBaseLevel,
+ {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}},
+ {DB::Properties::kTotalSstFilesSize,
+ {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveSstFilesSize,
+ {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimatePendingCompactionBytes,
+ {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes,
+ nullptr, nullptr}},
+ {DB::Properties::kNumRunningFlushes,
+ {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr,
+ nullptr}},
+ {DB::Properties::kNumRunningCompactions,
+ {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr,
+ nullptr}},
+ {DB::Properties::kActualDelayedWriteRate,
+ {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr,
+ nullptr}},
+ {DB::Properties::kIsWriteStopped,
+ {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateOldestKeyTime,
+ {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCacheCapacity,
+ {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCacheUsage,
+ {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCachePinnedUsage,
+ {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kOptionsStatistics,
+ {false, nullptr, nullptr, nullptr,
+ &DBImpl::GetPropertyHandleOptionsStatistics}},
+};
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& property) {
+ std::string ppt_name = GetPropertyNameAndArg(property).first.ToString();
+ auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name);
+ if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) {
+ return nullptr;
+ }
+ return &ppt_info_iter->second;
+}
+
+bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::string* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_string != nullptr);
+ Slice arg = GetPropertyNameAndArg(property).second;
+ return (this->*(property_info.handle_string))(value, arg);
+}
+
+bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_map != nullptr);
+ return (this->*(property_info.handle_map))(value);
+}
+
+bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info,
+ uint64_t* value, DBImpl* db) {
+ assert(value != nullptr);
+ assert(property_info.handle_int != nullptr &&
+ !property_info.need_out_of_mutex);
+ db->mutex_.AssertHeld();
+ return (this->*(property_info.handle_int))(value, db, nullptr /* version */);
+}
+
+bool InternalStats::GetIntPropertyOutOfMutex(
+ const DBPropertyInfo& property_info, Version* version, uint64_t* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_int != nullptr &&
+ property_info.need_out_of_mutex);
+ return (this->*(property_info.handle_int))(value, nullptr /* db */, version);
+}
+
+bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) {
+ uint64_t level;
+ const auto* vstorage = cfd_->current()->storage_info();
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d",
+ vstorage->NumLevelFiles(static_cast<int>(level)));
+ *value = buf;
+ return true;
+ }
+}
+
+bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value,
+ Slice suffix) {
+ uint64_t level;
+ const auto* vstorage = cfd_->current()->storage_info();
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || level >= static_cast<uint64_t>(number_levels_)) {
+ return false;
+ }
+ *value = ToString(
+ vstorage->GetEstimatedCompressionRatioAtLevel(static_cast<int>(level)));
+ return true;
+}
+
+bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) {
+ char buf[1000];
+ const auto* vstorage = cfd_->current()->storage_info();
+ snprintf(buf, sizeof(buf),
+ "Level Files Size(MB)\n"
+ "--------------------\n");
+ value->append(buf);
+
+ for (int level = 0; level < number_levels_; level++) {
+ snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+ vstorage->NumLevelFiles(level),
+ vstorage->NumLevelBytes(level) / kMB);
+ value->append(buf);
+ }
+ return true;
+}
+
+bool InternalStats::HandleStats(std::string* value, Slice suffix) {
+ if (!HandleCFStats(value, suffix)) {
+ return false;
+ }
+ if (!HandleDBStats(value, suffix)) {
+ return false;
+ }
+ return true;
+}
+
+bool InternalStats::HandleCFMapStats(
+ std::map<std::string, std::string>* cf_stats) {
+ DumpCFMapStats(cf_stats);
+ return true;
+}
+
+bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) {
+ DumpCFStats(value);
+ return true;
+}
+
+bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value,
+ Slice /*suffix*/) {
+ DumpCFStatsNoFileHistogram(value);
+ return true;
+}
+
+bool InternalStats::HandleCFFileHistogram(std::string* value,
+ Slice /*suffix*/) {
+ DumpCFFileHistogram(value);
+ return true;
+}
+
+bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) {
+ DumpDBStats(value);
+ return true;
+}
+
+bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) {
+ auto* current = cfd_->current();
+ *value = current->DebugString(true, true);
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTableProperties(std::string* value,
+ Slice /*suffix*/) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+ if (!s.ok()) {
+ return false;
+ }
+ *value = tp->ToString();
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value,
+ Slice suffix) {
+ uint64_t level;
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(
+ &tp, static_cast<int>(level));
+ if (!s.ok()) {
+ return false;
+ }
+ *value = tp->ToString();
+ return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->imm()->NumNotFlushed();
+ return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->imm()->NumFlushed();
+ return true;
+}
+
+bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
+ return true;
+}
+
+bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->num_running_flushes();
+ return true;
+}
+
+bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // 1 if the system has already determined that at least one compaction is
+ // needed; 0 otherwise.
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
+ return true;
+}
+
+bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->num_running_compactions_;
+ return true;
+}
+
+bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Accumulated number of errors in background flushes or compactions.
+ *value = GetBackgroundErrorCount();
+ return true;
+}
+
+bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current size of the active memtable
+ *value = cfd_->mem()->ApproximateMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current size of the active memtable + immutable memtables
+ *value = cfd_->mem()->ApproximateMemoryUsage() +
+ cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->mem()->ApproximateMemoryUsage() +
+ cfd_->imm()->ApproximateMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of entries in the active memtable
+ *value = cfd_->mem()->num_entries();
+ return true;
+}
+
+bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of entries in the immutable memtables
+ *value = cfd_->imm()->current()->GetTotalNumEntries();
+ return true;
+}
+
+bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of delete entries in the active memtable
+ *value = cfd_->mem()->num_deletes();
+ return true;
+}
+
+bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of entries in the immutable memtables
+ *value = cfd_->imm()->current()->GetTotalNumDeletes();
+ return true;
+}
+
+bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Estimate number of entries in the column family:
+ // Use estimated entries in tables + total entries in memtables.
+ const auto* vstorage = cfd_->current()->storage_info();
+ uint64_t estimate_keys = cfd_->mem()->num_entries() +
+ cfd_->imm()->current()->GetTotalNumEntries() +
+ vstorage->GetEstimatedActiveKeys();
+ uint64_t estimate_deletes =
+ cfd_->mem()->num_deletes() + cfd_->imm()->current()->GetTotalNumDeletes();
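+ // Each delete is counted twice: once for its own tombstone entry and once
+ // for the existing key it is expected to remove, hence the 2x factor below.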
+ *value = estimate_keys > estimate_deletes * 2
+ ? estimate_keys - (estimate_deletes * 2)
+ : 0;
+ return true;
+}
+
+bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->snapshots().count();
+ return true;
+}
+
+bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+ return true;
+}
+
+bool InternalStats::HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotSequence());
+ return true;
+}
+
+bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetNumLiveVersions();
+ return true;
+}
+
+bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetSuperVersionNumber();
+ return true;
+}
+
+bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->IsFileDeletionsEnabled();
+ return true;
+}
+
+bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = vstorage->base_level();
+ return true;
+}
+
+bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetTotalSstFilesSize();
+ return true;
+}
+
+bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetLiveSstFilesSize();
+ return true;
+}
+
+bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = vstorage->estimated_compaction_needed_bytes();
+ return true;
+}
+
+bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* version) {
+ *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders();
+ return true;
+}
+
+bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/,
+ Version* version) {
+ const auto* vstorage = version->storage_info();
+ *value = vstorage->EstimateLiveDataSize();
+ return true;
+}
+
+bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->MinLogNumberToKeep();
+ return true;
+}
+
+bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value,
+ DBImpl* db,
+ Version* /*version*/) {
+ *value = db->MinObsoleteSstNumberToKeep();
+ return true;
+}
+
+bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ const WriteController& wc = db->write_controller();
+ if (!wc.NeedsDelay()) {
+ *value = 0;
+ } else {
+ *value = wc.delayed_write_rate();
+ }
+ return true;
+}
+
+bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->write_controller().IsStopped() ? 1 : 0;
+ return true;
+}
+
+bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // TODO(yiwu): The property is currently only available for FIFO compaction
+ // with allow_compaction = false. This is because we don't propagate
+ // oldest_key_time on compaction.
+ if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO ||
+ cfd_->GetCurrentMutableCFOptions()
+ ->compaction_options_fifo.allow_compaction) {
+ return false;
+ }
+
+ TablePropertiesCollection collection;
+ auto s = cfd_->current()->GetPropertiesOfAllTables(&collection);
+ if (!s.ok()) {
+ return false;
+ }
+ *value = std::numeric_limits<uint64_t>::max();
+ for (auto& p : collection) {
+ *value = std::min(*value, p.second->oldest_key_time);
+ if (*value == 0) {
+ break;
+ }
+ }
+ if (*value > 0) {
+ *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(),
+ cfd_->imm()->ApproximateOldestKeyTime(), *value});
+ }
+ return *value > 0 && *value < std::numeric_limits<uint64_t>::max();
+}
+
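+// Helper for the block-cache properties below: retrieves the block cache from
+// the CF's BlockBasedTableFactory, or returns false if the CF does not use a
+// block-based table with a block cache.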
+bool InternalStats::HandleBlockCacheStat(Cache** block_cache) {
+ assert(block_cache != nullptr);
+ auto* table_factory = cfd_->ioptions()->table_factory;
+ assert(table_factory != nullptr);
+ if (BlockBasedTableFactory::kName != table_factory->Name()) {
+ return false;
+ }
+ auto* table_options =
+ reinterpret_cast<BlockBasedTableOptions*>(table_factory->GetOptions());
+ if (table_options == nullptr) {
+ return false;
+ }
+ *block_cache = table_options->block_cache.get();
+ if (table_options->no_block_cache || *block_cache == nullptr) {
+ return false;
+ }
+ return true;
+}
+
+bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache;
+ bool ok = HandleBlockCacheStat(&block_cache);
+ if (!ok) {
+ return false;
+ }
+ *value = static_cast<uint64_t>(block_cache->GetCapacity());
+ return true;
+}
+
+bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache;
+ bool ok = HandleBlockCacheStat(&block_cache);
+ if (!ok) {
+ return false;
+ }
+ *value = static_cast<uint64_t>(block_cache->GetUsage());
+ return true;
+}
+
+bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache;
+ bool ok = HandleBlockCacheStat(&block_cache);
+ if (!ok) {
+ return false;
+ }
+ *value = static_cast<uint64_t>(block_cache->GetPinnedUsage());
+ return true;
+}
+
+void InternalStats::DumpDBStats(std::string* value) {
+ char buf[1000];
+ // DB-level stats, only available from default column family
+ double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec;
+ double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
+ snprintf(buf, sizeof(buf),
+ "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
+ seconds_up, interval_seconds_up);
+ value->append(buf);
+ // Cumulative
+ uint64_t user_bytes_written =
+ GetDBStats(InternalStats::kIntStatsBytesWritten);
+ uint64_t num_keys_written =
+ GetDBStats(InternalStats::kIntStatsNumKeysWritten);
+ uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther);
+ uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf);
+ uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes);
+ uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced);
+ uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal);
+ uint64_t write_stall_micros =
+ GetDBStats(InternalStats::kIntStatsWriteStallMicros);
+
+ const int kHumanMicrosLen = 32;
+ char human_micros[kHumanMicrosLen];
+
+ // Data
+ // writes: total number of write requests.
+ // keys: total number of key updates issued by all the write requests
+ // commit groups: number of group commits issued to the DB. Each group can
+ // contain one or more writes.
+ // So keys / writes is the average number of puts per write request (e.g. in
+ // a multi-put), and writes / commit groups is the average group commit size.
+ //
+ // The format is the same for interval stats.
+ snprintf(buf, sizeof(buf),
+ "Cumulative writes: %s writes, %s keys, %s commit groups, "
+ "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(write_other + write_self).c_str(),
+ NumberToHumanString(num_keys_written).c_str(),
+ NumberToHumanString(write_self).c_str(),
+ (write_other + write_self) / static_cast<double>(write_self + 1),
+ user_bytes_written / kGB, user_bytes_written / kMB / seconds_up);
+ value->append(buf);
+ // WAL
+ snprintf(buf, sizeof(buf),
+ "Cumulative WAL: %s writes, %s syncs, "
+ "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(write_with_wal).c_str(),
+ NumberToHumanString(wal_synced).c_str(),
+ write_with_wal / static_cast<double>(wal_synced + 1),
+ wal_bytes / kGB, wal_bytes / kMB / seconds_up);
+ value->append(buf);
+ // Stall
+ AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+ snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n",
+ human_micros,
+ // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+ write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+ value->append(buf);
+
+ // Interval
+ uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+ uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+ uint64_t interval_num_keys_written =
+ num_keys_written - db_stats_snapshot_.num_keys_written;
+ snprintf(
+ buf, sizeof(buf),
+ "Interval writes: %s writes, %s keys, %s commit groups, "
+ "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n",
+ NumberToHumanString(interval_write_other + interval_write_self).c_str(),
+ NumberToHumanString(interval_num_keys_written).c_str(),
+ NumberToHumanString(interval_write_self).c_str(),
+ static_cast<double>(interval_write_other + interval_write_self) /
+ (interval_write_self + 1),
+ (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+ (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+ std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ uint64_t interval_write_with_wal =
+ write_with_wal - db_stats_snapshot_.write_with_wal;
+ uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+ uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+ snprintf(
+ buf, sizeof(buf),
+ "Interval WAL: %s writes, %s syncs, "
+ "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n",
+ NumberToHumanString(interval_write_with_wal).c_str(),
+ NumberToHumanString(interval_wal_synced).c_str(),
+ interval_write_with_wal / static_cast<double>(interval_wal_synced + 1),
+ interval_wal_bytes / kMB,
+ interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ // Stall
+ AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros,
+ human_micros, kHumanMicrosLen, true);
+ snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros,
+ // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+ (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+ 10000.0 / std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ db_stats_snapshot_.seconds_up = seconds_up;
+ db_stats_snapshot_.ingest_bytes = user_bytes_written;
+ db_stats_snapshot_.write_other = write_other;
+ db_stats_snapshot_.write_self = write_self;
+ db_stats_snapshot_.num_keys_written = num_keys_written;
+ db_stats_snapshot_.wal_bytes = wal_bytes;
+ db_stats_snapshot_.wal_synced = wal_synced;
+ db_stats_snapshot_.write_with_wal = write_with_wal;
+ db_stats_snapshot_.write_stall_micros = write_stall_micros;
+}
+
+/**
+ * Dump compaction level stats to a map from stat name (with a "compaction."
+ * prefix) to the stat value, formatted as a double encoded in a string. The
+ * level in a stat name is represented by the prefix "Lx", where "x" is the
+ * level number; the special level "Sum" holds the sum of a stat across all
+ * levels.
+ * The result also contains IO stall counters, whose keys start with
+ * "io_stalls." and whose values are uint64 counts encoded as strings.
+ */
+void InternalStats::DumpCFMapStats(
+ std::map<std::string, std::string>* cf_stats) {
+ CompactionStats compaction_stats_sum;
+ std::map<int, std::map<LevelStatType, double>> levels_stats;
+ DumpCFMapStats(&levels_stats, &compaction_stats_sum);
+ for (auto const& level_ent : levels_stats) {
+ auto level_str =
+ level_ent.first == -1 ? "Sum" : "L" + ToString(level_ent.first);
+ for (auto const& stat_ent : level_ent.second) {
+ auto stat_type = stat_ent.first;
+ auto key_str =
+ "compaction." + level_str + "." +
+ InternalStats::compaction_level_stats.at(stat_type).property_name;
+ (*cf_stats)[key_str] = std::to_string(stat_ent.second);
+ }
+ }
+
+ DumpCFMapStatsIOStalls(cf_stats);
+}
+
+void InternalStats::DumpCFMapStats(
+ std::map<int, std::map<LevelStatType, double>>* levels_stats,
+ CompactionStats* compaction_stats_sum) {
+ const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+
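+ // FIFO compaction keeps all files in a single level, so only L0 has a
+ // compaction score; otherwise every level except the bottommost can be a
+ // compaction source and has a score.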
+ int num_levels_to_check =
+ (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+ ? vstorage->num_levels() - 1
+ : 1;
+
+ // Compaction scores are sorted by value. Restore them to level order.
+ std::vector<double> compaction_score(number_levels_, 0);
+ for (int i = 0; i < num_levels_to_check; ++i) {
+ compaction_score[vstorage->CompactionScoreLevel(i)] =
+ vstorage->CompactionScore(i);
+ }
+ // Count # of files being compacted for each level
+ std::vector<int> files_being_compacted(number_levels_, 0);
+ for (int level = 0; level < number_levels_; ++level) {
+ for (auto* f : vstorage->LevelFiles(level)) {
+ if (f->being_compacted) {
+ ++files_being_compacted[level];
+ }
+ }
+ }
+
+ int total_files = 0;
+ int total_files_being_compacted = 0;
+ double total_file_size = 0;
+ uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+ uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+ uint64_t curr_ingest = flush_ingest + add_file_ingest;
+ for (int level = 0; level < number_levels_; level++) {
+ int files = vstorage->NumLevelFiles(level);
+ total_files += files;
+ total_files_being_compacted += files_being_compacted[level];
+ if (comp_stats_[level].micros > 0 || files > 0) {
+ compaction_stats_sum->Add(comp_stats_[level]);
+ total_file_size += vstorage->NumLevelBytes(level);
+ uint64_t input_bytes;
+ if (level == 0) {
+ input_bytes = curr_ingest;
+ } else {
+ input_bytes = comp_stats_[level].bytes_read_non_output_levels;
+ }
+ double w_amp =
+ (input_bytes == 0)
+ ? 0.0
+ : static_cast<double>(comp_stats_[level].bytes_written) /
+ input_bytes;
+ std::map<LevelStatType, double> level_stats;
+ PrepareLevelStats(&level_stats, files, files_being_compacted[level],
+ static_cast<double>(vstorage->NumLevelBytes(level)),
+ compaction_score[level], w_amp, comp_stats_[level]);
+ (*levels_stats)[level] = level_stats;
+ }
+ }
+ // Cumulative summary
+ double w_amp = compaction_stats_sum->bytes_written /
+ static_cast<double>(curr_ingest + 1);
+ // Stats summary across levels
+ std::map<LevelStatType, double> sum_stats;
+ PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted,
+ total_file_size, 0, w_amp, *compaction_stats_sum);
+ (*levels_stats)[-1] = sum_stats; // -1 is for the Sum level
+}
+
+void InternalStats::DumpCFMapStatsByPriority(
+ std::map<int, std::map<LevelStatType, double>>* priorities_stats) {
+ for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) {
+ if (comp_stats_by_pri_[priority].micros > 0) {
+ std::map<LevelStatType, double> priority_stats;
+ PrepareLevelStats(&priority_stats, 0 /* num_files */,
+ 0 /* being_compacted */, 0 /* total_file_size */,
+ 0 /* compaction_score */, 0 /* w_amp */,
+ comp_stats_by_pri_[priority]);
+ (*priorities_stats)[static_cast<int>(priority)] = priority_stats;
+ }
+ }
+}
+
+void InternalStats::DumpCFMapStatsIOStalls(
+ std::map<std::string, std::string>* cf_stats) {
+ (*cf_stats)["io_stalls.level0_slowdown"] =
+ std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] =
+ std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.level0_numfiles"] =
+ std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] =
+ std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] =
+ std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] =
+ std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.memtable_compaction"] =
+ std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.memtable_slowdown"] =
+ std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]);
+
+ uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS];
+
+ uint64_t total_slowdown =
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+
+ (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop);
+ (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown);
+}
+
+void InternalStats::DumpCFStats(std::string* value) {
+ DumpCFStatsNoFileHistogram(value);
+ DumpCFFileHistogram(value);
+}
+
+void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) {
+ char buf[2000];
+ // Per-ColumnFamily stats
+ PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level");
+ value->append(buf);
+
+ // Print stats for each level
+ std::map<int, std::map<LevelStatType, double>> levels_stats;
+ CompactionStats compaction_stats_sum;
+ DumpCFMapStats(&levels_stats, &compaction_stats_sum);
+ for (int l = 0; l < number_levels_; ++l) {
+ if (levels_stats.find(l) != levels_stats.end()) {
+ PrintLevelStats(buf, sizeof(buf), "L" + ToString(l), levels_stats[l]);
+ value->append(buf);
+ }
+ }
+
+ // Print sum of level stats
+ PrintLevelStats(buf, sizeof(buf), "Sum", levels_stats[-1]);
+ value->append(buf);
+
+ uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+ uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+ uint64_t ingest_files_addfile = cf_stats_value_[INGESTED_NUM_FILES_TOTAL];
+ uint64_t ingest_l0_files_addfile =
+ cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL];
+ uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL];
+ // Cumulative summary
+ uint64_t total_stall_count =
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+ // Interval summary
+ uint64_t interval_flush_ingest =
+ flush_ingest - cf_stats_snapshot_.ingest_bytes_flush;
+  uint64_t interval_add_file_ingest =
+      add_file_ingest - cf_stats_snapshot_.ingest_bytes_addfile;
+ uint64_t interval_ingest =
+      interval_flush_ingest + interval_add_file_ingest + 1;
+ CompactionStats interval_stats(compaction_stats_sum);
+ interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
+ double w_amp =
+ interval_stats.bytes_written / static_cast<double>(interval_ingest);
+ PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats);
+ value->append(buf);
+
+ PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority");
+ value->append(buf);
+ std::map<int, std::map<LevelStatType, double>> priorities_stats;
+ DumpCFMapStatsByPriority(&priorities_stats);
+ for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) {
+ if (priorities_stats.find(static_cast<int>(priority)) !=
+ priorities_stats.end()) {
+ PrintLevelStats(
+ buf, sizeof(buf),
+ Env::PriorityToString(static_cast<Env::Priority>(priority)),
+ priorities_stats[static_cast<int>(priority)]);
+ value->append(buf);
+ }
+ }
+
+ double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec;
+ double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up;
+ snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+ seconds_up, interval_seconds_up);
+ value->append(buf);
+ snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n",
+ flush_ingest / kGB, interval_flush_ingest / kGB);
+ value->append(buf);
+ snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n",
+           add_file_ingest / kGB, interval_add_file_ingest / kGB);
+ value->append(buf);
+
+ uint64_t interval_ingest_files_addfile =
+ ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(Total Files): cumulative %" PRIu64 ", interval %" PRIu64
+ "\n",
+ ingest_files_addfile, interval_ingest_files_addfile);
+ value->append(buf);
+
+ uint64_t interval_ingest_l0_files_addfile =
+ ingest_l0_files_addfile - cf_stats_snapshot_.ingest_l0_files_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(L0 Files): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+ ingest_l0_files_addfile, interval_ingest_l0_files_addfile);
+ value->append(buf);
+
+ uint64_t interval_ingest_keys_addfile =
+ ingest_keys_addfile - cf_stats_snapshot_.ingest_keys_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(Keys): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+ ingest_keys_addfile, interval_ingest_keys_addfile);
+ value->append(buf);
+
+ // Compact
+ uint64_t compact_bytes_read = 0;
+ uint64_t compact_bytes_write = 0;
+ uint64_t compact_micros = 0;
+ for (int level = 0; level < number_levels_; level++) {
+ compact_bytes_read += comp_stats_[level].bytes_read_output_level +
+ comp_stats_[level].bytes_read_non_output_levels;
+ compact_bytes_write += comp_stats_[level].bytes_written;
+ compact_micros += comp_stats_[level].micros;
+ }
+
+ snprintf(buf, sizeof(buf),
+ "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
+ "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+ compact_bytes_write / kGB, compact_bytes_write / kMB / seconds_up,
+ compact_bytes_read / kGB, compact_bytes_read / kMB / seconds_up,
+ compact_micros / kMicrosInSec);
+ value->append(buf);
+
+ // Compaction interval
+ uint64_t interval_compact_bytes_write =
+ compact_bytes_write - cf_stats_snapshot_.compact_bytes_write;
+ uint64_t interval_compact_bytes_read =
+ compact_bytes_read - cf_stats_snapshot_.compact_bytes_read;
+ uint64_t interval_compact_micros =
+ compact_micros - cf_stats_snapshot_.compact_micros;
+
+ snprintf(
+ buf, sizeof(buf),
+ "Interval compaction: %.2f GB write, %.2f MB/s write, "
+ "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+ interval_compact_bytes_write / kGB,
+ interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001),
+ interval_compact_bytes_read / kGB,
+ interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001),
+ interval_compact_micros / kMicrosInSec);
+ value->append(buf);
+ cf_stats_snapshot_.compact_bytes_write = compact_bytes_write;
+ cf_stats_snapshot_.compact_bytes_read = compact_bytes_read;
+ cf_stats_snapshot_.compact_micros = compact_micros;
+
+ snprintf(buf, sizeof(buf),
+ "Stalls(count): %" PRIu64
+ " level0_slowdown, "
+ "%" PRIu64
+ " level0_slowdown_with_compaction, "
+ "%" PRIu64
+ " level0_numfiles, "
+ "%" PRIu64
+ " level0_numfiles_with_compaction, "
+ "%" PRIu64
+ " stop for pending_compaction_bytes, "
+ "%" PRIu64
+ " slowdown for pending_compaction_bytes, "
+ "%" PRIu64
+ " memtable_compaction, "
+ "%" PRIu64
+ " memtable_slowdown, "
+ "interval %" PRIu64 " total count\n",
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+ cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS],
+ cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS],
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS],
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS],
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS],
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS],
+ total_stall_count - cf_stats_snapshot_.stall_count);
+ value->append(buf);
+
+ cf_stats_snapshot_.seconds_up = seconds_up;
+ cf_stats_snapshot_.ingest_bytes_flush = flush_ingest;
+ cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest;
+ cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile;
+ cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile;
+ cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile;
+ cf_stats_snapshot_.comp_stats = compaction_stats_sum;
+ cf_stats_snapshot_.stall_count = total_stall_count;
+}
+
+void InternalStats::DumpCFFileHistogram(std::string* value) {
+ char buf[2000];
+ snprintf(buf, sizeof(buf),
+ "\n** File Read Latency Histogram By Level [%s] **\n",
+ cfd_->GetName().c_str());
+ value->append(buf);
+
+ for (int level = 0; level < number_levels_; level++) {
+ if (!file_read_latency_[level].Empty()) {
+ char buf2[5000];
+ snprintf(buf2, sizeof(buf2),
+ "** Level %d read latency histogram (micros):\n%s\n", level,
+ file_read_latency_[level].ToString().c_str());
+ value->append(buf2);
+ }
+ }
+}
+
+#else
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) {
+ return nullptr;
+}
+
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
new file mode 100644
index 000000000..ce83be244
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.h
@@ -0,0 +1,697 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/version_set.h"
+
+class ColumnFamilyData;
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTableList;
+
+// Config for retrieving a property's value.
+struct DBPropertyInfo {
+ bool need_out_of_mutex;
+
+  // gcc had an internal error when initializing a union of pointer-to-member
+  // functions. The workaround is to populate exactly one of the following
+  // function pointers with a non-nullptr value.
+
+ // @param value Value-result argument for storing the property's string value
+ // @param suffix Argument portion of the property. For example, suffix would
+ // be "5" for the property "rocksdb.num-files-at-level5". So far, only
+ // certain string properties take an argument.
+ bool (InternalStats::*handle_string)(std::string* value, Slice suffix);
+
+ // @param value Value-result argument for storing the property's uint64 value
+ // @param db Many of the int properties rely on DBImpl methods.
+ // @param version Version is needed in case the property is retrieved without
+ // holding db mutex, which is only supported for int properties.
+ bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
+ Version* version);
+
+ // @param props Map of general properties to populate
+ bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props);
+
+ // handle the string type properties rely on DBImpl methods
+ // @param value Value-result argument for storing the property's string value
+ bool (DBImpl::*handle_string_dbimpl)(std::string* value);
+};
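+// A purely illustrative sketch of how an entry in the property table might be
+// shaped (the actual table, ppt_name_to_info, lives in internal_stats.cc and
+// the initializer below is hypothetical):
+//
+//   {DB::Properties::kNumFilesAtLevelPrefix,
+//    {false /* need_out_of_mutex */, &InternalStats::HandleNumFilesAtLevel,
+//     nullptr, nullptr, nullptr}},
+//
+// i.e. exactly one of the handler pointers is non-null for each property.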
+
+extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
+
+#ifndef ROCKSDB_LITE
+#undef SCORE
+enum class LevelStatType {
+ INVALID = 0,
+ NUM_FILES,
+ COMPACTED_FILES,
+ SIZE_BYTES,
+ SCORE,
+ READ_GB,
+ RN_GB,
+ RNP1_GB,
+ WRITE_GB,
+ W_NEW_GB,
+ MOVED_GB,
+ WRITE_AMP,
+ READ_MBPS,
+ WRITE_MBPS,
+ COMP_SEC,
+ COMP_CPU_SEC,
+ COMP_COUNT,
+ AVG_SEC,
+ KEY_IN,
+ KEY_DROP,
+ TOTAL // total number of types
+};
+
+struct LevelStat {
+  // This is what will appear as "L<n>.<property_name>" in the flat map
+  // returned to the user.
+  std::string property_name;
+  // This is what will be printed in the header in the CLI output.
+  std::string header_name;
+};
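+// For illustration only (hypothetical values; the real entries live in
+// compaction_level_stats in internal_stats.cc), an entry might look like
+//   {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}}
+// so the flat map key becomes "compaction.L1.WriteGB" while the CLI header
+// shows "Write(GB)".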
+
+class InternalStats {
+ public:
+ static const std::map<LevelStatType, LevelStat> compaction_level_stats;
+
+ enum InternalCFStatsType {
+ L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ MEMTABLE_LIMIT_STOPS,
+ MEMTABLE_LIMIT_SLOWDOWNS,
+ L0_FILE_COUNT_LIMIT_STOPS,
+ LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+ PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+ PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+ WRITE_STALLS_ENUM_MAX,
+ BYTES_FLUSHED,
+ BYTES_INGESTED_ADD_FILE,
+ INGESTED_NUM_FILES_TOTAL,
+ INGESTED_LEVEL0_NUM_FILES_TOTAL,
+ INGESTED_NUM_KEYS_TOTAL,
+ INTERNAL_CF_STATS_ENUM_MAX,
+ };
+
+ enum InternalDBStatsType {
+ kIntStatsWalFileBytes,
+ kIntStatsWalFileSynced,
+ kIntStatsBytesWritten,
+ kIntStatsNumKeysWritten,
+ kIntStatsWriteDoneByOther,
+ kIntStatsWriteDoneBySelf,
+ kIntStatsWriteWithWal,
+ kIntStatsWriteStallMicros,
+ kIntStatsNumMax,
+ };
+
+ InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd)
+ : db_stats_{},
+ cf_stats_value_{},
+ cf_stats_count_{},
+ comp_stats_(num_levels),
+ comp_stats_by_pri_(Env::Priority::TOTAL),
+ file_read_latency_(num_levels),
+ bg_error_count_(0),
+ number_levels_(num_levels),
+ env_(env),
+ cfd_(cfd),
+ started_at_(env->NowMicros()) {}
+
+ // Per level compaction stats. comp_stats_[level] stores the stats for
+ // compactions that produced data for the specified "level".
+ struct CompactionStats {
+ uint64_t micros;
+ uint64_t cpu_micros;
+
+ // The number of bytes read from all non-output levels
+ uint64_t bytes_read_non_output_levels;
+
+ // The number of bytes read from the compaction output level.
+ uint64_t bytes_read_output_level;
+
+ // Total number of bytes written during compaction
+ uint64_t bytes_written;
+
+ // Total number of bytes moved to the output level
+ uint64_t bytes_moved;
+
+ // The number of compaction input files in all non-output levels.
+ int num_input_files_in_non_output_levels;
+
+ // The number of compaction input files in the output level.
+ int num_input_files_in_output_level;
+
+ // The number of compaction output files.
+ int num_output_files;
+
+ // Total incoming entries during compaction between levels N and N+1
+ uint64_t num_input_records;
+
+    // Accumulated difference in the number of entries
+    // (num input entries - num output entries) for compaction between levels N and N+1
+ uint64_t num_dropped_records;
+
+ // Number of compactions done
+ int count;
+
+ // Number of compactions done per CompactionReason
+ int counts[static_cast<int>(CompactionReason::kNumOfReasons)];
+
+ explicit CompactionStats()
+ : micros(0),
+ cpu_micros(0),
+ bytes_read_non_output_levels(0),
+ bytes_read_output_level(0),
+ bytes_written(0),
+ bytes_moved(0),
+ num_input_files_in_non_output_levels(0),
+ num_input_files_in_output_level(0),
+ num_output_files(0),
+ num_input_records(0),
+ num_dropped_records(0),
+ count(0) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ }
+
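+    // Records `c` compactions attributed to `reason`; a reason outside the
+    // known range leaves both the total and the per-reason counters at zero.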
+ explicit CompactionStats(CompactionReason reason, int c)
+ : micros(0),
+ cpu_micros(0),
+ bytes_read_non_output_levels(0),
+ bytes_read_output_level(0),
+ bytes_written(0),
+ bytes_moved(0),
+ num_input_files_in_non_output_levels(0),
+ num_input_files_in_output_level(0),
+ num_output_files(0),
+ num_input_records(0),
+ num_dropped_records(0),
+ count(c) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ int r = static_cast<int>(reason);
+ if (r >= 0 && r < num_of_reasons) {
+ counts[r] = c;
+ } else {
+ count = 0;
+ }
+ }
+
+ explicit CompactionStats(const CompactionStats& c)
+ : micros(c.micros),
+ cpu_micros(c.cpu_micros),
+ bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+ bytes_read_output_level(c.bytes_read_output_level),
+ bytes_written(c.bytes_written),
+ bytes_moved(c.bytes_moved),
+ num_input_files_in_non_output_levels(
+ c.num_input_files_in_non_output_levels),
+ num_input_files_in_output_level(c.num_input_files_in_output_level),
+ num_output_files(c.num_output_files),
+ num_input_records(c.num_input_records),
+ num_dropped_records(c.num_dropped_records),
+ count(c.count) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ }
+
+ CompactionStats& operator=(const CompactionStats& c) {
+ micros = c.micros;
+ cpu_micros = c.cpu_micros;
+ bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+ bytes_read_output_level = c.bytes_read_output_level;
+ bytes_written = c.bytes_written;
+ bytes_moved = c.bytes_moved;
+ num_input_files_in_non_output_levels =
+ c.num_input_files_in_non_output_levels;
+ num_input_files_in_output_level = c.num_input_files_in_output_level;
+ num_output_files = c.num_output_files;
+ num_input_records = c.num_input_records;
+ num_dropped_records = c.num_dropped_records;
+ count = c.count;
+
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ return *this;
+ }
+
+ void Clear() {
+ this->micros = 0;
+ this->cpu_micros = 0;
+ this->bytes_read_non_output_levels = 0;
+ this->bytes_read_output_level = 0;
+ this->bytes_written = 0;
+ this->bytes_moved = 0;
+ this->num_input_files_in_non_output_levels = 0;
+ this->num_input_files_in_output_level = 0;
+ this->num_output_files = 0;
+ this->num_input_records = 0;
+ this->num_dropped_records = 0;
+ this->count = 0;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ }
+
+ void Add(const CompactionStats& c) {
+ this->micros += c.micros;
+ this->cpu_micros += c.cpu_micros;
+ this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+ this->bytes_read_output_level += c.bytes_read_output_level;
+ this->bytes_written += c.bytes_written;
+ this->bytes_moved += c.bytes_moved;
+ this->num_input_files_in_non_output_levels +=
+ c.num_input_files_in_non_output_levels;
+ this->num_input_files_in_output_level +=
+ c.num_input_files_in_output_level;
+ this->num_output_files += c.num_output_files;
+ this->num_input_records += c.num_input_records;
+ this->num_dropped_records += c.num_dropped_records;
+ this->count += c.count;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += c.counts[i];
+ }
+ }
+
+ void Subtract(const CompactionStats& c) {
+ this->micros -= c.micros;
+ this->cpu_micros -= c.cpu_micros;
+ this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+ this->bytes_read_output_level -= c.bytes_read_output_level;
+ this->bytes_written -= c.bytes_written;
+ this->bytes_moved -= c.bytes_moved;
+ this->num_input_files_in_non_output_levels -=
+ c.num_input_files_in_non_output_levels;
+ this->num_input_files_in_output_level -=
+ c.num_input_files_in_output_level;
+ this->num_output_files -= c.num_output_files;
+ this->num_input_records -= c.num_input_records;
+ this->num_dropped_records -= c.num_dropped_records;
+ this->count -= c.count;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] -= c.counts[i];
+ }
+ }
+ };
+
+ void Clear() {
+ for (int i = 0; i < kIntStatsNumMax; i++) {
+ db_stats_[i].store(0);
+ }
+ for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
+ cf_stats_count_[i] = 0;
+ cf_stats_value_[i] = 0;
+ }
+ for (auto& comp_stat : comp_stats_) {
+ comp_stat.Clear();
+ }
+ for (auto& h : file_read_latency_) {
+ h.Clear();
+ }
+ cf_stats_snapshot_.Clear();
+ db_stats_snapshot_.Clear();
+ bg_error_count_ = 0;
+ started_at_ = env_->NowMicros();
+ }
+
+ void AddCompactionStats(int level, Env::Priority thread_pri,
+ const CompactionStats& stats) {
+ comp_stats_[level].Add(stats);
+ comp_stats_by_pri_[thread_pri].Add(stats);
+ }
+
+ void IncBytesMoved(int level, uint64_t amount) {
+ comp_stats_[level].bytes_moved += amount;
+ }
+
+ void AddCFStats(InternalCFStatsType type, uint64_t value) {
+ cf_stats_value_[type] += value;
+ ++cf_stats_count_[type];
+ }
+
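+  // When `concurrent` is true the update is a single atomic fetch_add;
+  // otherwise the stat is assumed to be updated while holding the DB mutex,
+  // so a cheaper relaxed load followed by a store is sufficient.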
+ void AddDBStats(InternalDBStatsType type, uint64_t value,
+ bool concurrent = false) {
+ auto& v = db_stats_[type];
+ if (concurrent) {
+ v.fetch_add(value, std::memory_order_relaxed);
+ } else {
+ v.store(v.load(std::memory_order_relaxed) + value,
+ std::memory_order_relaxed);
+ }
+ }
+
+ uint64_t GetDBStats(InternalDBStatsType type) {
+ return db_stats_[type].load(std::memory_order_relaxed);
+ }
+
+ HistogramImpl* GetFileReadHist(int level) {
+ return &file_read_latency_[level];
+ }
+
+ uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+ uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+ bool GetStringProperty(const DBPropertyInfo& property_info,
+ const Slice& property, std::string* value);
+
+ bool GetMapProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::map<std::string, std::string>* value);
+
+ bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
+ DBImpl* db);
+
+ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
+ Version* version, uint64_t* value);
+
+ const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+ return comp_stats_;
+ }
+
+ // Store a mapping from the user-facing DB::Properties string to our
+ // DBPropertyInfo struct used internally for retrieving properties.
+ static const std::unordered_map<std::string, DBPropertyInfo> ppt_name_to_info;
+
+ private:
+ void DumpDBStats(std::string* value);
+ void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
+ void DumpCFMapStats(
+ std::map<int, std::map<LevelStatType, double>>* level_stats,
+ CompactionStats* compaction_stats_sum);
+ void DumpCFMapStatsByPriority(
+ std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+ void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
+ void DumpCFStats(std::string* value);
+ void DumpCFStatsNoFileHistogram(std::string* value);
+ void DumpCFFileHistogram(std::string* value);
+
+ bool HandleBlockCacheStat(Cache** block_cache);
+
+ // Per-DB stats
+ std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
+ // Per-ColumnFamily stats
+ uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
+ uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
+ // Per-ColumnFamily/level compaction stats
+ std::vector<CompactionStats> comp_stats_;
+ std::vector<CompactionStats> comp_stats_by_pri_;
+ std::vector<HistogramImpl> file_read_latency_;
+
+ // Used to compute per-interval statistics
+ struct CFStatsSnapshot {
+ // ColumnFamily-level stats
+ CompactionStats comp_stats;
+ uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush)
+ uint64_t stall_count; // Stall count
+ // Stats from compaction jobs - bytes written, bytes read, duration.
+ uint64_t compact_bytes_write;
+ uint64_t compact_bytes_read;
+ uint64_t compact_micros;
+ double seconds_up;
+
+ // AddFile specific stats
+ uint64_t ingest_bytes_addfile; // Total Bytes ingested
+ uint64_t ingest_files_addfile; // Total number of files ingested
+ uint64_t ingest_l0_files_addfile; // Total number of files ingested to L0
+ uint64_t ingest_keys_addfile; // Total number of keys ingested
+
+ CFStatsSnapshot()
+ : ingest_bytes_flush(0),
+ stall_count(0),
+ compact_bytes_write(0),
+ compact_bytes_read(0),
+ compact_micros(0),
+ seconds_up(0),
+ ingest_bytes_addfile(0),
+ ingest_files_addfile(0),
+ ingest_l0_files_addfile(0),
+ ingest_keys_addfile(0) {}
+
+ void Clear() {
+ comp_stats.Clear();
+ ingest_bytes_flush = 0;
+ stall_count = 0;
+ compact_bytes_write = 0;
+ compact_bytes_read = 0;
+ compact_micros = 0;
+ seconds_up = 0;
+ ingest_bytes_addfile = 0;
+ ingest_files_addfile = 0;
+ ingest_l0_files_addfile = 0;
+ ingest_keys_addfile = 0;
+ }
+ } cf_stats_snapshot_;
+
+ struct DBStatsSnapshot {
+ // DB-level stats
+ uint64_t ingest_bytes; // Bytes written by user
+ uint64_t wal_bytes; // Bytes written to WAL
+ uint64_t wal_synced; // Number of times WAL is synced
+ uint64_t write_with_wal; // Number of writes that request WAL
+    // These count the number of write requests done by another thread
+    // (write_other) and by the calling thread itself (write_self).
+    uint64_t write_other;
+    uint64_t write_self;
+    // Total number of keys written. write_self and write_other measure the
+    // number of write requests, and each write request can contain updates to
+    // multiple keys. num_keys_written is the total number of keys updated by
+    // all those writes.
+ uint64_t num_keys_written;
+ // Total time writes delayed by stalls.
+ uint64_t write_stall_micros;
+ double seconds_up;
+
+ DBStatsSnapshot()
+ : ingest_bytes(0),
+ wal_bytes(0),
+ wal_synced(0),
+ write_with_wal(0),
+ write_other(0),
+ write_self(0),
+ num_keys_written(0),
+ write_stall_micros(0),
+ seconds_up(0) {}
+
+ void Clear() {
+ ingest_bytes = 0;
+ wal_bytes = 0;
+ wal_synced = 0;
+ write_with_wal = 0;
+ write_other = 0;
+ write_self = 0;
+ num_keys_written = 0;
+ write_stall_micros = 0;
+ seconds_up = 0;
+ }
+ } db_stats_snapshot_;
+
+ // Handler functions for getting property values. They use "value" as a value-
+ // result argument, and return true upon successfully setting "value".
+ bool HandleNumFilesAtLevel(std::string* value, Slice suffix);
+ bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
+ bool HandleLevelStats(std::string* value, Slice suffix);
+ bool HandleStats(std::string* value, Slice suffix);
+ bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats);
+ bool HandleCFStats(std::string* value, Slice suffix);
+ bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
+ bool HandleCFFileHistogram(std::string* value, Slice suffix);
+ bool HandleDBStats(std::string* value, Slice suffix);
+ bool HandleSsTables(std::string* value, Slice suffix);
+ bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
+ bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix);
+ bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db,
+ Version* version);
+ // Total number of background errors encountered. Every time a flush task
+ // or compaction task fails, this counter is incremented. The failure can
+ // be caused by any possible reason, including file system errors, out of
+ // resources, or input file corruption. Failing when retrying the same flush
+ // or compaction will cause the counter to increase too.
+ uint64_t bg_error_count_;
+
+ const int number_levels_;
+ Env* env_;
+ ColumnFamilyData* cfd_;
+ uint64_t started_at_;
+};
+
+#else
+
+class InternalStats {
+ public:
+ enum InternalCFStatsType {
+ L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ MEMTABLE_LIMIT_STOPS,
+ MEMTABLE_LIMIT_SLOWDOWNS,
+ L0_FILE_COUNT_LIMIT_STOPS,
+ LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+ PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+ PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+ WRITE_STALLS_ENUM_MAX,
+ BYTES_FLUSHED,
+ BYTES_INGESTED_ADD_FILE,
+ INGESTED_NUM_FILES_TOTAL,
+ INGESTED_LEVEL0_NUM_FILES_TOTAL,
+ INGESTED_NUM_KEYS_TOTAL,
+ INTERNAL_CF_STATS_ENUM_MAX,
+ };
+
+ enum InternalDBStatsType {
+ kIntStatsWalFileBytes,
+ kIntStatsWalFileSynced,
+ kIntStatsBytesWritten,
+ kIntStatsNumKeysWritten,
+ kIntStatsWriteDoneByOther,
+ kIntStatsWriteDoneBySelf,
+ kIntStatsWriteWithWal,
+ kIntStatsWriteStallMicros,
+ kIntStatsNumMax,
+ };
+
+ InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {}
+
+ struct CompactionStats {
+ uint64_t micros;
+ uint64_t cpu_micros;
+ uint64_t bytes_read_non_output_levels;
+ uint64_t bytes_read_output_level;
+ uint64_t bytes_written;
+ uint64_t bytes_moved;
+ int num_input_files_in_non_output_levels;
+ int num_input_files_in_output_level;
+ int num_output_files;
+ uint64_t num_input_records;
+ uint64_t num_dropped_records;
+ int count;
+
+ explicit CompactionStats() {}
+
+ explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {}
+
+ explicit CompactionStats(const CompactionStats& /*c*/) {}
+
+ void Add(const CompactionStats& /*c*/) {}
+
+ void Subtract(const CompactionStats& /*c*/) {}
+ };
+
+ void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+ const CompactionStats& /*stats*/) {}
+
+ void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
+
+ void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
+
+ void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/,
+ bool /*concurrent */ = false) {}
+
+ HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; }
+
+ uint64_t GetBackgroundErrorCount() const { return 0; }
+
+ uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
+
+ bool GetStringProperty(const DBPropertyInfo& /*property_info*/,
+ const Slice& /*property*/, std::string* /*value*/) {
+ return false;
+ }
+
+ bool GetMapProperty(const DBPropertyInfo& /*property_info*/,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* /*value*/) {
+ return false;
+ }
+
+ bool GetIntProperty(const DBPropertyInfo& /*property_info*/, uint64_t* /*value*/,
+ DBImpl* /*db*/) const {
+ return false;
+ }
+
+ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/,
+ Version* /*version*/, uint64_t* /*value*/) const {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
new file mode 100644
index 000000000..31ff26c3a
--- /dev/null
+++ b/src/rocksdb/db/job_context.h
@@ -0,0 +1,219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/log_writer.h"
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+ struct WriteStallNotification {
+ WriteStallInfo write_stall_info;
+ const ImmutableCFOptions* immutable_cf_options;
+ };
+
+ autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ autovector<WriteStallNotification> write_stall_notifications;
+#endif
+ std::unique_ptr<SuperVersion>
+ new_superversion; // if nullptr no new superversion
+
+ explicit SuperVersionContext(bool create_superversion = false)
+ : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+ explicit SuperVersionContext(SuperVersionContext&& other)
+ : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+ new_superversion(std::move(other.new_superversion)) {
+ }
+
+ void NewSuperVersion() {
+ new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+ }
+
+ inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ return !superversions_to_free.empty() ||
+ !write_stall_notifications.empty();
+#else
+ return !superversions_to_free.empty();
+#endif
+ }
+
+ void PushWriteStallNotification(
+ WriteStallCondition old_cond, WriteStallCondition new_cond,
+ const std::string& name, const ImmutableCFOptions* ioptions) {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ WriteStallNotification notif;
+ notif.write_stall_info.cf_name = name;
+ notif.write_stall_info.condition.prev = old_cond;
+ notif.write_stall_info.condition.cur = new_cond;
+ notif.immutable_cf_options = ioptions;
+ write_stall_notifications.push_back(notif);
+#else
+ (void)old_cond;
+ (void)new_cond;
+ (void)name;
+ (void)ioptions;
+#endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ }
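+  // The notification is only buffered here; listeners are not invoked until
+  // Clean() runs, which callers are expected to do without holding the DB
+  // mutex (see the JobContext::Clean() comment below).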
+
+ void Clean() {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ // notify listeners on changed write stall conditions
+ for (auto& notif : write_stall_notifications) {
+ for (auto& listener : notif.immutable_cf_options->listeners) {
+ listener->OnStallConditionsChanged(notif.write_stall_info);
+ }
+ }
+ write_stall_notifications.clear();
+#endif  // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ // free superversions
+ for (auto s : superversions_to_free) {
+ delete s;
+ }
+ superversions_to_free.clear();
+ }
+
+ ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ assert(write_stall_notifications.empty());
+#endif
+ assert(superversions_to_free.empty());
+ }
+};
+
+struct JobContext {
+ inline bool HaveSomethingToDelete() const {
+ return full_scan_candidate_files.size() || sst_delete_files.size() ||
+ log_delete_files.size() || manifest_delete_files.size();
+ }
+
+ inline bool HaveSomethingToClean() const {
+ bool sv_have_sth = false;
+ for (const auto& sv_ctx : superversion_contexts) {
+ if (sv_ctx.HaveSomethingToDelete()) {
+ sv_have_sth = true;
+ break;
+ }
+ }
+ return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+ sv_have_sth;
+ }
+
+ // Structure to store information for candidate files to delete.
+ struct CandidateFileInfo {
+ std::string file_name;
+ std::string file_path;
+ CandidateFileInfo(std::string name, std::string path)
+ : file_name(std::move(name)), file_path(std::move(path)) {}
+ bool operator==(const CandidateFileInfo& other) const {
+ return file_name == other.file_name &&
+ file_path == other.file_path;
+ }
+ };
+
+ // Unique job id
+ int job_id;
+
+ // a list of all files that we'll consider deleting
+ // (every once in a while this is filled up with all files
+ // in the DB directory)
+ // (filled only if we're doing full scan)
+ std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+ // the list of all live sst files that cannot be deleted
+ std::vector<FileDescriptor> sst_live;
+
+ // a list of sst files that we need to delete
+ std::vector<ObsoleteFileInfo> sst_delete_files;
+
+ // a list of log files that we need to delete
+ std::vector<uint64_t> log_delete_files;
+
+ // a list of log files that we need to preserve during full purge since they
+ // will be reused later
+ std::vector<uint64_t> log_recycle_files;
+
+ // a list of manifest files that we need to delete
+ std::vector<std::string> manifest_delete_files;
+
+  // a list of memtables to be freed
+ autovector<MemTable*> memtables_to_free;
+
+ // contexts for installing superversions for multiple column families
+ std::vector<SuperVersionContext> superversion_contexts;
+
+ autovector<log::Writer*> logs_to_free;
+
+  // the current manifest_file_number, log_number and prev_log_number
+  // that correspond to the set of files in 'live'.
+ uint64_t manifest_file_number;
+ uint64_t pending_manifest_file_number;
+ uint64_t log_number;
+ uint64_t prev_log_number;
+
+ uint64_t min_pending_output = 0;
+ uint64_t prev_total_log_size = 0;
+ size_t num_alive_log_files = 0;
+ uint64_t size_log_to_delete = 0;
+
+ // Snapshot taken before flush/compaction job.
+ std::unique_ptr<ManagedSnapshot> job_snapshot;
+
+ explicit JobContext(int _job_id, bool create_superversion = false) {
+ job_id = _job_id;
+ manifest_file_number = 0;
+ pending_manifest_file_number = 0;
+ log_number = 0;
+ prev_log_number = 0;
+ superversion_contexts.emplace_back(
+ SuperVersionContext(create_superversion));
+ }
+
+  // For a non-empty JobContext, Clean() has to be called at least once before
+  // destruction (see the asserts in ~JobContext()). It should be called with
+  // the DB mutex unlocked. The destructor doesn't call Clean() to avoid
+  // accidentally doing a potentially slow Clean() with the DB mutex locked.
+ void Clean() {
+ // free superversions
+ for (auto& sv_context : superversion_contexts) {
+ sv_context.Clean();
+ }
+ // free pending memtables
+ for (auto m : memtables_to_free) {
+ delete m;
+ }
+ for (auto l : logs_to_free) {
+ delete l;
+ }
+
+ memtables_to_free.clear();
+ logs_to_free.clear();
+ job_snapshot.reset();
+ }
+
+ ~JobContext() {
+ assert(memtables_to_free.size() == 0);
+ assert(logs_to_free.size() == 0);
+ }
+};
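+// A minimal usage sketch (assumed call pattern; names such as `mutex_` and
+// `next_job_id` are illustrative, not part of this header):
+//
+//   JobContext job_context(next_job_id, /*create_superversion=*/true);
+//   // ... fill the context while doing flush/compaction bookkeeping ...
+//   mutex_.Unlock();
+//   job_context.Clean();  // must run without the DB mutex held
+//   mutex_.Lock();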
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc
new file mode 100644
index 000000000..eb1a08a35
--- /dev/null
+++ b/src/rocksdb/db/listener_test.cc
@@ -0,0 +1,1042 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memtable/hash_linklist_rep.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/plain/plain_table_factory.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventListenerTest : public DBTestBase {
+ public:
+ EventListenerTest() : DBTestBase("/listener_test") {}
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ const size_t k110KB = 110 << 10;
+};
+
+struct TestPropertiesCollector
+ : public ROCKSDB_NAMESPACE::TablePropertiesCollector {
+ ROCKSDB_NAMESPACE::Status AddUserKey(
+ const ROCKSDB_NAMESPACE::Slice& /*key*/,
+ const ROCKSDB_NAMESPACE::Slice& /*value*/,
+ ROCKSDB_NAMESPACE::EntryType /*type*/,
+ ROCKSDB_NAMESPACE::SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ return Status::OK();
+ }
+ ROCKSDB_NAMESPACE::Status Finish(
+ ROCKSDB_NAMESPACE::UserCollectedProperties* properties) override {
+ properties->insert({"0", "1"});
+ return Status::OK();
+ }
+
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+ ROCKSDB_NAMESPACE::UserCollectedProperties GetReadableProperties()
+ const override {
+ ROCKSDB_NAMESPACE::UserCollectedProperties ret;
+ ret["2"] = "3";
+ return ret;
+ }
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TestPropertiesCollector;
+ }
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+class TestCompactionListener : public EventListener {
+ public:
+ explicit TestCompactionListener(EventListenerTest* test) : test_(test) {}
+
+ void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ compacted_dbs_.push_back(db);
+ ASSERT_GT(ci.input_files.size(), 0U);
+ ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size());
+
+ for (size_t i = 0; i < ci.input_file_infos.size(); ++i) {
+ ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level);
+ ASSERT_EQ(ci.input_file_infos[i].file_number,
+ TableFileNameToNumber(ci.input_files[i]));
+ }
+
+ ASSERT_GT(ci.output_files.size(), 0U);
+ ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size());
+
+ ASSERT_TRUE(test_);
+ ASSERT_EQ(test_->db_, db);
+
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id],
+ &files_by_level);
+ ASSERT_GT(files_by_level.size(), ci.output_level);
+
+ for (size_t i = 0; i < ci.output_file_infos.size(); ++i) {
+ ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level);
+ ASSERT_EQ(ci.output_file_infos[i].file_number,
+ TableFileNameToNumber(ci.output_files[i]));
+
+ auto it = std::find_if(
+ files_by_level[ci.output_level].begin(),
+ files_by_level[ci.output_level].end(), [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == ci.output_file_infos[i].file_number;
+ });
+ ASSERT_NE(it, files_by_level[ci.output_level].end());
+
+ ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number,
+ it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
+ ASSERT_GT(ci.thread_id, 0U);
+
+ for (auto fl : {ci.input_files, ci.output_files}) {
+ for (auto fn : fl) {
+ auto it = ci.table_properties.find(fn);
+ ASSERT_NE(it, ci.table_properties.end());
+ auto tp = it->second;
+ ASSERT_TRUE(tp != nullptr);
+ ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
+ }
+ }
+ }
+
+ EventListenerTest* test_;
+ std::vector<DB*> compacted_dbs_;
+ std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ const int kNumL0Files = 4;
+
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+
+ TestCompactionListener* listener = new TestCompactionListener(this);
+ options.listeners.emplace_back(listener);
+ std::vector<std::string> cf_names = {
+ "pikachu", "ilya", "muromec", "dobrynia",
+ "nikitich", "alyosha", "popovich"};
+ CreateAndReopenWithCF(cf_names, options);
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+ BlobStr(123, 0, 1 << 10)));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i],
+ nullptr, nullptr));
+ dbfull()->TEST_WaitForCompact();
+ }
+
+ ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ ASSERT_EQ(listener->compacted_dbs_[i], db_);
+ }
+}
+
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+ TestFlushListener(Env* env, EventListenerTest* test)
+ : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+ db_closed = false;
+ }
+ void OnTableFileCreated(
+ const TableFileCreationInfo& info) override {
+ // remember the info for later checking the FlushJobInfo.
+ prev_fc_info_ = info;
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ // Verify the id of the current thread that created this table
+ // file matches the id of any active flush or compaction thread.
+ uint64_t thread_id = env_->GetThreadID();
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ bool found_match = false;
+ for (auto thread_status : thread_list) {
+ if (thread_status.operation_type == ThreadStatus::OP_FLUSH ||
+ thread_status.operation_type == ThreadStatus::OP_COMPACTION) {
+ if (thread_id == thread_status.thread_id) {
+ found_match = true;
+ break;
+ }
+ }
+ }
+ ASSERT_TRUE(found_match);
+#endif // ROCKSDB_USING_THREAD_STATUS
+ }
+
+ void OnFlushCompleted(
+ DB* db, const FlushJobInfo& info) override {
+ flushed_dbs_.push_back(db);
+ flushed_column_family_names_.push_back(info.cf_name);
+ if (info.triggered_writes_slowdown) {
+ slowdown_count++;
+ }
+ if (info.triggered_writes_stop) {
+ stop_count++;
+ }
+ // verify whether the previously created file matches the flushed file.
+ ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+ ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+ ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+ ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+ ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+ // Note: the following chunk relies on the notification pertaining to the
+ // database pointed to by DBTestBase::db_, and is thus bypassed when
+ // that assumption does not hold (see the test case MultiDBMultiListeners
+ // below).
+ ASSERT_TRUE(test_);
+ if (db == test_->db_) {
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id],
+ &files_by_level);
+
+ ASSERT_FALSE(files_by_level.empty());
+ auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+ [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == info.file_number;
+ });
+ ASSERT_NE(it, files_by_level[0].end());
+ ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+ ASSERT_GT(info.thread_id, 0U);
+ ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second,
+ "1");
+ }
+
+ std::vector<std::string> flushed_column_family_names_;
+ std::vector<DB*> flushed_dbs_;
+ int slowdown_count;
+ int stop_count;
+ bool db_closing;
+ std::atomic_bool db_closed;
+ TableFileCreationInfo prev_fc_info_;
+
+ protected:
+ Env* env_;
+ EventListenerTest* test_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBFlushTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+ std::vector<std::string> cf_names = {
+ "pikachu", "ilya", "muromec", "dobrynia",
+ "nikitich", "alyosha", "popovich"};
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ CreateAndReopenWithCF(cf_names, options);
+
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+ BlobStr(456, 0, 1 << 10)));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(listener->flushed_dbs_.size(), i);
+ ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+ }
+
+ // make sure callback functions are called in the right order
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ ASSERT_EQ(listener->flushed_dbs_[i], db_);
+ ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+ }
+}
+
+TEST_F(EventListenerTest, MultiCF) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ std::vector<std::string> cf_names = {
+ "pikachu", "ilya", "muromec", "dobrynia",
+ "nikitich", "alyosha", "popovich"};
+ CreateAndReopenWithCF(cf_names, options);
+
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_EQ(listener->flushed_dbs_.size(), i);
+ ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+ }
+
+ // make sure callback functions are called in the right order
+ for (size_t i = 0; i < cf_names.size(); i++) {
+ ASSERT_EQ(listener->flushed_dbs_[i], db_);
+ ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+ }
+}
+
+TEST_F(EventListenerTest, MultiDBMultiListeners) {
+ Options options;
+ options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ std::vector<TestFlushListener*> listeners;
+ const int kNumDBs = 5;
+ const int kNumListeners = 10;
+ for (int i = 0; i < kNumListeners; ++i) {
+ listeners.emplace_back(new TestFlushListener(options.env, this));
+ }
+
+ std::vector<std::string> cf_names = {
+ "pikachu", "ilya", "muromec", "dobrynia",
+ "nikitich", "alyosha", "popovich"};
+
+ options.create_if_missing = true;
+ for (int i = 0; i < kNumListeners; ++i) {
+ options.listeners.emplace_back(listeners[i]);
+ }
+ DBOptions db_opts(options);
+ ColumnFamilyOptions cf_opts(options);
+
+ std::vector<DB*> dbs;
+ std::vector<std::vector<ColumnFamilyHandle *>> vec_handles;
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_OK(DestroyDB(dbname_ + ToString(d), options));
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db));
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ ColumnFamilyHandle* handle;
+ db->CreateColumnFamily(cf_opts, cf_names[c], &handle);
+ handles.push_back(handle);
+ }
+
+ vec_handles.push_back(std::move(handles));
+ dbs.push_back(db);
+ }
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c],
+ cf_names[c], cf_names[c]));
+ }
+ }
+
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+ reinterpret_cast<DBImpl*>(dbs[d])->TEST_WaitForFlushMemTable();
+ }
+ }
+
+ for (auto* listener : listeners) {
+ int pos = 0;
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+ ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+ pos++;
+ }
+ }
+ }
+
+
+ for (auto handles : vec_handles) {
+ for (auto h : handles) {
+ delete h;
+ }
+ handles.clear();
+ }
+ vec_handles.clear();
+
+ for (auto db : dbs) {
+ delete db;
+ }
+}
+
+TEST_F(EventListenerTest, DisableBGCompaction) {
+ Options options;
+ options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ const int kCompactionTrigger = 1;
+ const int kSlowdownTrigger = 5;
+ const int kStopTrigger = 100;
+ options.level0_file_num_compaction_trigger = kCompactionTrigger;
+ options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+ options.level0_stop_writes_trigger = kStopTrigger;
+ options.max_write_buffer_number = 10;
+ options.listeners.emplace_back(listener);
+  // BG compaction is disabled. The number of L0 files will simply keep
+  // increasing in this test.
+ options.compaction_style = kCompactionStyleNone;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+
+ // keep writing until writes are forced to stop.
+ for (int i = 0; static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10;
+ ++i) {
+ Put(1, ToString(i), std::string(10000, 'x'), WriteOptions());
+ FlushOptions fo;
+ fo.allow_write_stall = true;
+ db_->Flush(fo, handles_[1]);
+ db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ }
+ ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
+}
+
+class TestCompactionReasonListener : public EventListener {
+ public:
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ compaction_reasons_.push_back(ci.compaction_reason);
+ }
+
+ std::vector<CompactionReason> compaction_reasons_;
+ std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, CompactionReasonLevel) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_style = kCompactionStyleLevel;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Write 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(listener->compaction_reasons_.size(), 1);
+ ASSERT_EQ(listener->compaction_reasons_[0],
+ CompactionReason::kLevelL0FilesNum);
+
+ DestroyAndReopen(options);
+
+ // Write 3 non-overlapping files in L0
+ for (int k = 1; k <= 30; k++) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ if (k % 10 == 0) {
+ Flush();
+ }
+ }
+
+ // Do a trivial move from L0 -> L1
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ options.max_bytes_for_level_base = 1;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_GT(listener->compaction_reasons_.size(), 1);
+
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kLevelMaxLevelSize);
+ }
+
+ options.disable_auto_compactions = true;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ Put("key", "value");
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+ }
+}
+
+TEST_F(EventListenerTest, CompactionReasonUniversal) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.compaction_style = kCompactionStyleUniversal;
+
+ Random rnd(301);
+
+ options.level0_file_num_compaction_trigger = 8;
+ options.compaction_options_universal.max_size_amplification_percent = 100000;
+ options.compaction_options_universal.size_ratio = 100000;
+ DestroyAndReopen(options);
+ listener->compaction_reasons_.clear();
+
+ // Write 8 files in L0
+ for (int i = 0; i < 8; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeRatio);
+ }
+
+ options.level0_file_num_compaction_trigger = 8;
+ options.compaction_options_universal.max_size_amplification_percent = 1;
+ options.compaction_options_universal.size_ratio = 100000;
+
+ DestroyAndReopen(options);
+ listener->compaction_reasons_.clear();
+
+ // Write 8 files in L0
+ for (int i = 0; i < 8; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeAmplification);
+ }
+
+ options.disable_auto_compactions = true;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+ }
+}
+
+TEST_F(EventListenerTest, CompactionReasonFIFO) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.compaction_options_fifo.max_table_files_size = 1;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Write 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kFIFOMaxSize);
+ }
+}
+
+class TableFileCreationListener : public EventListener {
+ public:
+ class TestEnv : public EnvWrapper {
+ public:
+ TestEnv() : EnvWrapper(Env::Default()) {}
+
+ void SetStatus(Status s) { status_ = s; }
+
+ Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override {
+ if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") {
+ if (!status_.ok()) {
+ return status_;
+ }
+ }
+ return Env::Default()->NewWritableFile(fname, result, options);
+ }
+
+ private:
+ Status status_;
+ };
+
+ TableFileCreationListener() {
+ for (int i = 0; i < 2; i++) {
+ started_[i] = finished_[i] = failure_[i] = 0;
+ }
+ }
+
+ int Index(TableFileCreationReason reason) {
+ int idx;
+ switch (reason) {
+ case TableFileCreationReason::kFlush:
+ idx = 0;
+ break;
+ case TableFileCreationReason::kCompaction:
+ idx = 1;
+ break;
+ default:
+ idx = -1;
+ }
+ return idx;
+ }
+
+ void CheckAndResetCounters(int flush_started, int flush_finished,
+ int flush_failure, int compaction_started,
+ int compaction_finished, int compaction_failure) {
+ ASSERT_EQ(started_[0], flush_started);
+ ASSERT_EQ(finished_[0], flush_finished);
+ ASSERT_EQ(failure_[0], flush_failure);
+ ASSERT_EQ(started_[1], compaction_started);
+ ASSERT_EQ(finished_[1], compaction_finished);
+ ASSERT_EQ(failure_[1], compaction_failure);
+ for (int i = 0; i < 2; i++) {
+ started_[i] = finished_[i] = failure_[i] = 0;
+ }
+ }
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& info) override {
+ int idx = Index(info.reason);
+ if (idx >= 0) {
+ started_[idx]++;
+ }
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ }
+
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ int idx = Index(info.reason);
+ if (idx >= 0) {
+ finished_[idx]++;
+ }
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ if (info.status.ok()) {
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+ } else {
+ if (idx >= 0) {
+ failure_[idx]++;
+ }
+ }
+ }
+
+ TestEnv test_env;
+ int started_[2];
+ int finished_[2];
+ int failure_[2];
+};
+
+TEST_F(EventListenerTest, TableFileCreationListenersTest) {
+ auto listener = std::make_shared<TableFileCreationListener>();
+ Options options;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ options.env = &listener->test_env;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "aaa"));
+ ASSERT_OK(Put("bar", "bbb"));
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+ ASSERT_OK(Put("foo", "aaa1"));
+ ASSERT_OK(Put("bar", "bbb1"));
+ listener->test_env.SetStatus(Status::NotSupported("not supported"));
+ ASSERT_NOK(Flush());
+ listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+ listener->test_env.SetStatus(Status::OK());
+
+ Reopen(options);
+ ASSERT_OK(Put("foo", "aaa2"));
+ ASSERT_OK(Put("bar", "bbb2"));
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+ const Slice kRangeStart = "a";
+ const Slice kRangeEnd = "z";
+ dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd);
+ dbfull()->TEST_WaitForCompact();
+ listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0);
+
+ ASSERT_OK(Put("foo", "aaa3"));
+ ASSERT_OK(Put("bar", "bbb3"));
+ ASSERT_OK(Flush());
+ listener->test_env.SetStatus(Status::NotSupported("not supported"));
+ dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd);
+ dbfull()->TEST_WaitForCompact();
+ listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1);
+}
+
+class MemTableSealedListener : public EventListener {
+ private:
+  SequenceNumber latest_seq_number_;
+
+ public:
+  MemTableSealedListener() {}
+  void OnMemTableSealed(const MemTableInfo& info) override {
+    latest_seq_number_ = info.first_seqno;
+  }
+
+  void OnFlushCompleted(DB* /*db*/,
+                        const FlushJobInfo& flush_job_info) override {
+    ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_);
+  }
+};
+
+TEST_F(EventListenerTest, MemTableSealedListenerTest) {
+ auto listener = std::make_shared<MemTableSealedListener>();
+ Options options;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ DestroyAndReopen(options);
+
+ for (unsigned int i = 0; i < 10; i++) {
+ std::string tag = std::to_string(i);
+ ASSERT_OK(Put("foo"+tag, "aaa"));
+ ASSERT_OK(Put("bar"+tag, "bbb"));
+
+ ASSERT_OK(Flush());
+ }
+}
+
+class ColumnFamilyHandleDeletionStartedListener : public EventListener {
+ private:
+ std::vector<std::string> cfs_;
+ int counter;
+
+ public:
+ explicit ColumnFamilyHandleDeletionStartedListener(
+ const std::vector<std::string>& cfs)
+ : cfs_(cfs), counter(0) {
+ cfs_.insert(cfs_.begin(), kDefaultColumnFamilyName);
+ }
+ void OnColumnFamilyHandleDeletionStarted(
+ ColumnFamilyHandle* handle) override {
+ ASSERT_EQ(cfs_[handle->GetID()], handle->GetName());
+ counter++;
+ }
+ int getCounter() { return counter; }
+};
+
+TEST_F(EventListenerTest, ColumnFamilyHandleDeletionStartedListenerTest) {
+ std::vector<std::string> cfs{"pikachu", "eevee", "Mewtwo"};
+ auto listener =
+ std::make_shared<ColumnFamilyHandleDeletionStartedListener>(cfs);
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ CreateAndReopenWithCF(cfs, options);
+ ASSERT_EQ(handles_.size(), 4);
+ delete handles_[3];
+ delete handles_[2];
+ delete handles_[1];
+ handles_.resize(1);
+ ASSERT_EQ(listener->getCounter(), 3);
+}
+
+class BackgroundErrorListener : public EventListener {
+ private:
+ SpecialEnv* env_;
+ int counter_;
+
+ public:
+ BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {}
+
+ void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ Status* bg_error) override {
+ if (counter_ == 0) {
+ // suppress the first error and disable write-dropping such that a retry
+ // can succeed.
+ *bg_error = Status::OK();
+ env_->drop_writes_.store(false, std::memory_order_release);
+ env_->no_slowdown_ = false;
+ }
+ ++counter_;
+ }
+
+ int counter() { return counter_; }
+};
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedFlushTest) {
+ auto listener = std::make_shared<BackgroundErrorListener>(env_);
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.listeners.push_back(listener);
+ options.memtable_factory.reset(new SpecialSkipListFactory(1));
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ // the usual TEST_WaitForFlushMemTable() doesn't work for failed flushes, so
+ // forge a custom one for the failed flush case.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkFlush:done",
+ "EventListenerTest:BackgroundErrorListenerFailedFlushTest:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
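+  // With this dependency loaded, the TEST_SYNC_POINT(...":1") call below
+  // blocks until the background flush thread has passed
+  // "DBImpl::BGWorkFlush:done", i.e. until the (failed) flush has finished.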
+
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->no_slowdown_ = true;
+
+ ASSERT_OK(Put("key0", "val"));
+ ASSERT_OK(Put("key1", "val"));
+ TEST_SYNC_POINT("EventListenerTest:BackgroundErrorListenerFailedFlushTest:1");
+ ASSERT_EQ(1, listener->counter());
+ ASSERT_OK(Put("key2", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedCompactionTest) {
+ auto listener = std::make_shared<BackgroundErrorListener>(env_);
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.push_back(listener);
+ options.memtable_factory.reset(new SpecialSkipListFactory(2));
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ // third iteration triggers the second memtable's flush
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("key0", "val"));
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(Put("key1", "val"));
+ }
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->no_slowdown_ = true;
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, listener->counter());
+
+ // trigger flush so compaction is triggered again; this time it succeeds
+ // The previous failed compaction may get retried automatically, so we may
+ // be left with 0 or 1 files in level 1, depending on when the retry gets
+ // scheduled
+ ASSERT_OK(Put("key0", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_LE(1, NumTableFilesAtLevel(0));
+}
+
+class TestFileOperationListener : public EventListener {
+ public:
+ TestFileOperationListener() {
+ file_reads_.store(0);
+ file_reads_success_.store(0);
+ file_writes_.store(0);
+ file_writes_success_.store(0);
+ }
+
+ void OnFileReadFinish(const FileOperationInfo& info) override {
+ ++file_reads_;
+ if (info.status.ok()) {
+ ++file_reads_success_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileWriteFinish(const FileOperationInfo& info) override {
+ ++file_writes_;
+ if (info.status.ok()) {
+ ++file_writes_success_;
+ }
+ ReportDuration(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+ std::atomic<size_t> file_reads_;
+ std::atomic<size_t> file_reads_success_;
+ std::atomic<size_t> file_writes_;
+ std::atomic<size_t> file_writes_success_;
+
+ private:
+ void ReportDuration(const FileOperationInfo& info) const {
+ auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
+ info.finish_timestamp - info.start_timestamp);
+ ASSERT_GT(duration.count(), 0);
+ }
+};
+
+TEST_F(EventListenerTest, OnFileOperationTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+
+ TestFileOperationListener* listener = new TestFileOperationListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "aaa"));
+ dbfull()->Flush(FlushOptions());
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_GE(listener->file_writes_.load(),
+ listener->file_writes_success_.load());
+ ASSERT_GT(listener->file_writes_.load(), 0);
+ Close();
+
+ Reopen(options);
+ ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load());
+ ASSERT_GT(listener->file_reads_.load(), 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_format.h b/src/rocksdb/db/log_format.h
new file mode 100644
index 000000000..c22e2b6bc
--- /dev/null
+++ b/src/rocksdb/db/log_format.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+enum RecordType {
+ // Zero is reserved for preallocated files
+ kZeroType = 0,
+ kFullType = 1,
+
+ // For fragments
+ kFirstType = 2,
+ kMiddleType = 3,
+ kLastType = 4,
+
+ // For recycled log files
+ kRecyclableFullType = 5,
+ kRecyclableFirstType = 6,
+ kRecyclableMiddleType = 7,
+ kRecyclableLastType = 8,
+};
+static const int kMaxRecordType = kRecyclableLastType;
+
+static const unsigned int kBlockSize = 32768;
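+
+// A logical record that does not fit in the space remaining in the current
+// block is written as a kFirstType fragment followed by zero or more
+// kMiddleType fragments and a kLastType fragment; the reader reassembles
+// these into a single record (see log_reader.cc).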
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte)
+static const int kHeaderSize = 4 + 2 + 1;
+
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
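+
+// Illustrative example (not part of the format definition): a legacy record
+// carrying the 3-byte payload "foo" occupies kHeaderSize + 3 = 10 bytes:
+//   [crc32c (4 bytes)][length = 0x0003, little-endian (2 bytes)]
+//   [type = kFullType (1 byte)]["foo"]
+// A recyclable record additionally stores the 4-byte log number between the
+// type byte and the payload.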
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
new file mode 100644
index 000000000..c60a814b9
--- /dev/null
+++ b/src/rocksdb/db/log_reader.cc
@@ -0,0 +1,624 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "file/sequence_file_reader.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+Reader::Reader(std::shared_ptr<Logger> info_log,
+ std::unique_ptr<SequentialFileReader>&& _file,
+ Reporter* reporter, bool checksum, uint64_t log_num)
+ : info_log_(info_log),
+ file_(std::move(_file)),
+ reporter_(reporter),
+ checksum_(checksum),
+ backing_store_(new char[kBlockSize]),
+ buffer_(),
+ eof_(false),
+ read_error_(false),
+ eof_offset_(0),
+ last_record_offset_(0),
+ end_of_buffer_offset_(0),
+ log_number_(log_num),
+ recycled_(false) {}
+
+Reader::~Reader() {
+ delete[] backing_store_;
+}
+
+// For kAbsoluteConsistency, on clean shutdown we don't expect any error
+// in the log files. For other modes, we can ignore only incomplete records
+// in the last log file, which are presumably due to a write in progress
+// during restart (or from log recycling).
+//
+// TODO krad: Evaluate if we need to move to a more strict mode where we
+// restrict the inconsistency to only the last log
+bool Reader::ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode) {
+ scratch->clear();
+ record->clear();
+ bool in_fragmented_record = false;
+ // Record offset of the logical record that we're reading
+ // 0 is a dummy value to make compilers happy
+ uint64_t prospective_record_offset = 0;
+
+ Slice fragment;
+ while (true) {
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+ size_t drop_size = 0;
+ const unsigned int record_type = ReadPhysicalRecord(&fragment, &drop_size);
+ switch (record_type) {
+ case kFullType:
+ case kRecyclableFullType:
+ if (in_fragmented_record && !scratch->empty()) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ ReportCorruption(scratch->size(), "partial record without end(1)");
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->clear();
+ *record = fragment;
+ last_record_offset_ = prospective_record_offset;
+ return true;
+
+ case kFirstType:
+ case kRecyclableFirstType:
+ if (in_fragmented_record && !scratch->empty()) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ ReportCorruption(scratch->size(), "partial record without end(2)");
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->assign(fragment.data(), fragment.size());
+ in_fragmented_record = true;
+ break;
+
+ case kMiddleType:
+ case kRecyclableMiddleType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ case kRecyclableLastType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ return true;
+ }
+ break;
+
+ case kBadHeader:
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
+ // in clean shutdown we don't expect any error in the log files
+ ReportCorruption(drop_size, "truncated header");
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kEof:
+ if (in_fragmented_record) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
+ // in clean shutdown we don't expect any error in the log files
+ ReportCorruption(scratch->size(), "error reading trailing data");
+ }
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+
+ case kOldRecord:
+ if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // Treat a record from a previous instance of the log as EOF.
+ if (in_fragmented_record) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
+ // in clean shutdown we don't expect any error in the log files
+ ReportCorruption(scratch->size(), "error reading trailing data");
+ }
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kBadRecord:
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ case kBadRecordLen:
+ case kBadRecordChecksum:
+ if (recycled_ &&
+ wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords) {
+ scratch->clear();
+ return false;
+ }
+ if (record_type == kBadRecordLen) {
+ ReportCorruption(drop_size, "bad record length");
+ } else {
+ ReportCorruption(drop_size, "checksum mismatch");
+ }
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+ ReportCorruption(
+ (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+ buf);
+ in_fragmented_record = false;
+ scratch->clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+uint64_t Reader::LastRecordOffset() {
+ return last_record_offset_;
+}
+
+void Reader::UnmarkEOF() {
+ if (read_error_) {
+ return;
+ }
+ eof_ = false;
+ if (eof_offset_ == 0) {
+ return;
+ }
+ UnmarkEOFInternal();
+}
+
+void Reader::UnmarkEOFInternal() {
+ // If the EOF was in the middle of a block (a partial block was read) we have
+ // to read the rest of the block as ReadPhysicalRecord can only read full
+ // blocks and expects the file position indicator to be aligned to the start
+ // of a block.
+ //
+ // consumed_bytes + buffer_size() + remaining == kBlockSize
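+  //
+  // For example, if eof_offset_ == 100 and 40 unconsumed bytes remain in
+  // buffer_, then consumed_bytes == 60 and remaining == kBlockSize - 100.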
+
+ size_t consumed_bytes = eof_offset_ - buffer_.size();
+ size_t remaining = kBlockSize - eof_offset_;
+
+ // backing_store_ is used to concatenate what is left in buffer_ and
+ // the remainder of the block. If buffer_ already uses backing_store_,
+ // we just append the new data.
+ if (buffer_.data() != backing_store_ + consumed_bytes) {
+ // Buffer_ does not use backing_store_ for storage.
+    // Copy what is left in buffer_ to backing_store_.
+ memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+ }
+
+ Slice read_buffer;
+ Status status = file_->Read(remaining, &read_buffer,
+ backing_store_ + eof_offset_);
+
+ size_t added = read_buffer.size();
+ end_of_buffer_offset_ += added;
+
+ if (!status.ok()) {
+ if (added > 0) {
+ ReportDrop(added, status);
+ }
+
+ read_error_ = true;
+ return;
+ }
+
+ if (read_buffer.data() != backing_store_ + eof_offset_) {
+ // Read did not write to backing_store_
+ memmove(backing_store_ + eof_offset_, read_buffer.data(),
+ read_buffer.size());
+ }
+
+ buffer_ = Slice(backing_store_ + consumed_bytes,
+ eof_offset_ + added - consumed_bytes);
+
+ if (added < remaining) {
+ eof_ = true;
+ eof_offset_ += added;
+ } else {
+ eof_offset_ = 0;
+ }
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+ ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+ if (reporter_ != nullptr) {
+ reporter_->Corruption(bytes, reason);
+ }
+}
+
+bool Reader::ReadMore(size_t* drop_size, int *error) {
+ if (!eof_ && !read_error_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ read_error_ = true;
+ *error = kEof;
+ return false;
+ } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+ eof_ = true;
+ eof_offset_ = buffer_.size();
+ }
+ return true;
+ } else {
+ // Note that if buffer_ is non-empty, we have a truncated header at the
+ // end of the file, which can be caused by the writer crashing in the
+ // middle of writing the header. Unless explicitly requested we don't
+    // consider this an error, just report EOF.
+ if (buffer_.size()) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ *error = kBadHeader;
+ return false;
+ }
+ buffer_.clear();
+ *error = kEof;
+ return false;
+ }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
+ while (true) {
+ // We need at least the minimum header size
+ if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+ // the default value of r is meaningless because ReadMore will overwrite
+ // it if it returns false; in case it returns true, the return value will
+ // not be used anyway
+ int r = kEof;
+ if (!ReadMore(drop_size, &r)) {
+ return r;
+ }
+ continue;
+ }
+
+ // Parse the header
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
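+    // The 2-byte length is little-endian: e.g. header bytes 0x2A 0x01 at
+    // offsets 4-5 encode a payload length of 0x012A (298).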
+ int header_size = kHeaderSize;
+ if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+ if (end_of_buffer_offset_ - buffer_.size() == 0) {
+ recycled_ = true;
+ }
+ header_size = kRecyclableHeaderSize;
+ // We need enough for the larger header
+ if (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+ int r = kEof;
+ if (!ReadMore(drop_size, &r)) {
+ return r;
+ }
+ continue;
+ }
+ const uint32_t log_num = DecodeFixed32(header + 7);
+ if (log_num != log_number_) {
+ return kOldRecord;
+ }
+ }
+ if (header_size + length > buffer_.size()) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ if (!eof_) {
+ return kBadRecordLen;
+ }
+ // If the end of the file has been reached without reading |length|
+ // bytes of payload, assume the writer died in the middle of writing the
+ // record. Don't report a corruption unless requested.
+ if (*drop_size) {
+ return kBadHeader;
+ }
+ return kEof;
+ }
+
+ if (type == kZeroType && length == 0) {
+ // Skip zero length record without reporting any drops since
+ // such records are produced by the mmap based writing code in
+ // env_posix.cc that preallocates file regions.
+      // NOTE: this should never happen in a DB written by new RocksDB versions,
+ // since we turn off mmap writes to manifest and log files
+ buffer_.clear();
+ return kBadRecord;
+ }
+
+ // Check crc
+ if (checksum_) {
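+      // The stored CRC (the first four header bytes, masked) covers the type
+      // byte and everything after it: type + payload for legacy records,
+      // type + log number + payload for recyclable records.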
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+ if (actual_crc != expected_crc) {
+ // Drop the rest of the buffer since "length" itself may have
+ // been corrupted and if we trust it, we could find some
+ // fragment of a real log record that just happens to look
+ // like a valid log record.
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ return kBadRecordChecksum;
+ }
+ }
+
+ buffer_.remove_prefix(header_size + length);
+
+ *result = Slice(header + header_size, length);
+ return type;
+ }
+}
+
+bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode /*unused*/) {
+ assert(record != nullptr);
+ assert(scratch != nullptr);
+ record->clear();
+ scratch->clear();
+
+ uint64_t prospective_record_offset = 0;
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+ size_t drop_size = 0;
+ unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy
+ Slice fragment;
+ while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) {
+ switch (fragment_type_or_err) {
+ case kFullType:
+ case kRecyclableFullType:
+ if (in_fragmented_record_ && !fragments_.empty()) {
+ ReportCorruption(fragments_.size(), "partial record without end(1)");
+ }
+ fragments_.clear();
+ *record = fragment;
+ prospective_record_offset = physical_record_offset;
+ last_record_offset_ = prospective_record_offset;
+ in_fragmented_record_ = false;
+ return true;
+
+ case kFirstType:
+ case kRecyclableFirstType:
+ if (in_fragmented_record_ || !fragments_.empty()) {
+ ReportCorruption(fragments_.size(), "partial record without end(2)");
+ }
+ prospective_record_offset = physical_record_offset;
+ fragments_.assign(fragment.data(), fragment.size());
+ in_fragmented_record_ = true;
+ break;
+
+ case kMiddleType:
+ case kRecyclableMiddleType:
+ if (!in_fragmented_record_) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ fragments_.append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ case kRecyclableLastType:
+ if (!in_fragmented_record_) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ fragments_.append(fragment.data(), fragment.size());
+ scratch->assign(fragments_.data(), fragments_.size());
+ fragments_.clear();
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ in_fragmented_record_ = false;
+ return true;
+ }
+ break;
+
+ case kBadHeader:
+ case kBadRecord:
+ case kEof:
+ case kOldRecord:
+ if (in_fragmented_record_) {
+ ReportCorruption(fragments_.size(), "error in middle of record");
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ }
+ break;
+
+ case kBadRecordChecksum:
+ if (recycled_) {
+ fragments_.clear();
+ return false;
+ }
+ ReportCorruption(drop_size, "checksum mismatch");
+ if (in_fragmented_record_) {
+ ReportCorruption(fragments_.size(), "error in middle of record");
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ }
+ break;
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u",
+ fragment_type_or_err);
+ ReportCorruption(
+ fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0),
+ buf);
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+void FragmentBufferedReader::UnmarkEOF() {
+ if (read_error_) {
+ return;
+ }
+ eof_ = false;
+ UnmarkEOFInternal();
+}
+
+bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) {
+ if (!eof_ && !read_error_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ read_error_ = true;
+ *error = kEof;
+ return false;
+ } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+ eof_ = true;
+ eof_offset_ = buffer_.size();
+ TEST_SYNC_POINT_CALLBACK(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr);
+ }
+ return true;
+ } else if (!read_error_) {
+ UnmarkEOF();
+ }
+ if (!read_error_) {
+ return true;
+ }
+ *error = kEof;
+ *drop_size = buffer_.size();
+ if (buffer_.size() > 0) {
+ *error = kBadHeader;
+ }
+ buffer_.clear();
+ return false;
+}
+
+// return true if the caller should process the fragment_type_or_err.
+bool FragmentBufferedReader::TryReadFragment(
+ Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) {
+ assert(fragment != nullptr);
+ assert(drop_size != nullptr);
+ assert(fragment_type_or_err != nullptr);
+
+ while (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
+ int header_size = kHeaderSize;
+ if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+ if (end_of_buffer_offset_ - buffer_.size() == 0) {
+ recycled_ = true;
+ }
+ header_size = kRecyclableHeaderSize;
+ while (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+ const uint32_t log_num = DecodeFixed32(header + 7);
+ if (log_num != log_number_) {
+ *fragment_type_or_err = kOldRecord;
+ return true;
+ }
+ }
+
+ while (header_size + length > buffer_.size()) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+
+ if (type == kZeroType && length == 0) {
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecord;
+ return true;
+ }
+
+ if (checksum_) {
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+ if (actual_crc != expected_crc) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecordChecksum;
+ return true;
+ }
+ }
+
+ buffer_.remove_prefix(header_size + length);
+
+ *fragment = Slice(header + header_size, length);
+ *fragment_type_or_err = type;
+ return true;
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.h b/src/rocksdb/db/log_reader.h
new file mode 100644
index 000000000..293ae957c
--- /dev/null
+++ b/src/rocksdb/db/log_reader.h
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "file/sequence_file_reader.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+
+namespace log {
+
+/**
+ * Reader is a general purpose log stream reader implementation. The actual job
+ * of reading from the device is implemented by the SequentialFile interface.
+ *
+ * Please see Writer for details on the file and record layout.
+ */
+class Reader {
+ public:
+ // Interface for reporting errors.
+ class Reporter {
+ public:
+ virtual ~Reporter();
+
+ // Some corruption was detected. "size" is the approximate number
+ // of bytes dropped due to the corruption.
+ virtual void Corruption(size_t bytes, const Status& status) = 0;
+ };
+
+ // Create a reader that will return log records from "*file".
+ // "*file" must remain live while this Reader is in use.
+ //
+ // If "reporter" is non-nullptr, it is notified whenever some data is
+ // dropped due to a detected corruption. "*reporter" must remain
+ // live while this Reader is in use.
+ //
+ // If "checksum" is true, verify checksums if available.
+ Reader(std::shared_ptr<Logger> info_log,
+ // @lint-ignore TXT2 T25377293 Grandfathered in
+ std::unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
+ bool checksum, uint64_t log_num);
+ // No copying allowed
+ Reader(const Reader&) = delete;
+ void operator=(const Reader&) = delete;
+
+ virtual ~Reader();
+
+ // Read the next record into *record. Returns true if read
+ // successfully, false if we hit end of the input. May use
+ // "*scratch" as temporary storage. The contents filled in *record
+ // will only be valid until the next mutating operation on this
+ // reader or the next mutation to *scratch.
+ virtual bool ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords);
+
+ // Returns the physical offset of the last record returned by ReadRecord.
+ //
+ // Undefined before the first call to ReadRecord.
+ uint64_t LastRecordOffset();
+
+ // returns true if the reader has encountered an eof condition.
+ bool IsEOF() {
+ return eof_;
+ }
+
+  // returns true if the reader has encountered a read error.
+ bool hasReadError() const { return read_error_; }
+
+  // When we know more data has been written to the file, we can use this
+  // function to force the reader to look again in the file.
+ // Also aligns the file position indicator to the start of the next block
+ // by reading the rest of the data from the EOF position to the end of the
+ // block that was partially read.
+ virtual void UnmarkEOF();
+
+ SequentialFileReader* file() { return file_.get(); }
+
+ Reporter* GetReporter() const { return reporter_; }
+
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ size_t GetReadOffset() const {
+ return static_cast<size_t>(end_of_buffer_offset_);
+ }
+
+ protected:
+ std::shared_ptr<Logger> info_log_;
+ const std::unique_ptr<SequentialFileReader> file_;
+ Reporter* const reporter_;
+ bool const checksum_;
+ char* const backing_store_;
+
+ // Internal state variables used for reading records
+ Slice buffer_;
+ bool eof_; // Last Read() indicated EOF by returning < kBlockSize
+ bool read_error_; // Error occurred while reading from file
+
+ // Offset of the file position indicator within the last block when an
+ // EOF was detected.
+ size_t eof_offset_;
+
+ // Offset of the last record returned by ReadRecord.
+ uint64_t last_record_offset_;
+ // Offset of the first location past the end of buffer_.
+ uint64_t end_of_buffer_offset_;
+
+ // which log number this is
+ uint64_t const log_number_;
+
+ // Whether this is a recycled log file
+ bool recycled_;
+
+ // Extend record types with the following special values
+ enum {
+ kEof = kMaxRecordType + 1,
+ // Returned whenever we find an invalid physical record.
+    // Currently there are two situations in which this happens:
+ // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+ // * The record is a 0-length record (No drop is reported)
+ kBadRecord = kMaxRecordType + 2,
+ // Returned when we fail to read a valid header.
+ kBadHeader = kMaxRecordType + 3,
+ // Returned when we read an old record from a previous user of the log.
+ kOldRecord = kMaxRecordType + 4,
+ // Returned when we get a bad record length
+ kBadRecordLen = kMaxRecordType + 5,
+ // Returned when we get a bad record checksum
+ kBadRecordChecksum = kMaxRecordType + 6,
+ };
+
+ // Return type, or one of the preceding special values
+ unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size);
+
+ // Read some more
+ bool ReadMore(size_t* drop_size, int *error);
+
+ void UnmarkEOFInternal();
+
+ // Reports dropped bytes to the reporter.
+ // buffer_ must be updated to remove the dropped bytes prior to invocation.
+ void ReportCorruption(size_t bytes, const char* reason);
+ void ReportDrop(size_t bytes, const Status& reason);
+};
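+
+// A minimal usage sketch (illustrative only; assumes `file_reader` is an
+// already-constructed std::unique_ptr<SequentialFileReader> and `my_reporter`
+// implements Reader::Reporter):
+//
+//   log::Reader reader(nullptr /* info_log */, std::move(file_reader),
+//                      &my_reporter, true /* checksum */, 0 /* log_num */);
+//   Slice record;
+//   std::string scratch;
+//   while (reader.ReadRecord(&record, &scratch)) {
+//     // `record` holds one complete logical record; it is only valid until
+//     // the next ReadRecord() call or a mutation of `scratch`.
+//   }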
+
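+// Unlike the base Reader, FragmentBufferedReader keeps partially read
+// fragments buffered across ReadRecord() calls, so a read that stopped at
+// EOF mid-record can be resumed later (after more data has been appended)
+// without losing the fragments already seen.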
+class FragmentBufferedReader : public Reader {
+ public:
+ FragmentBufferedReader(std::shared_ptr<Logger> info_log,
+ // @lint-ignore TXT2 T25377293 Grandfathered in
+ std::unique_ptr<SequentialFileReader>&& _file,
+ Reporter* reporter, bool checksum, uint64_t log_num)
+ : Reader(info_log, std::move(_file), reporter, checksum, log_num),
+ fragments_(),
+ in_fragmented_record_(false) {}
+ ~FragmentBufferedReader() override {}
+ bool ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords) override;
+ void UnmarkEOF() override;
+
+ private:
+ std::string fragments_;
+ bool in_fragmented_record_;
+
+ bool TryReadFragment(Slice* result, size_t* drop_size,
+ unsigned int* fragment_type_or_err);
+
+ bool TryReadMore(size_t* drop_size, int* error);
+
+ // No copy allowed
+ FragmentBufferedReader(const FragmentBufferedReader&);
+ void operator=(const FragmentBufferedReader&);
+};
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc
new file mode 100644
index 000000000..849b89d8a
--- /dev/null
+++ b/src/rocksdb/db/log_test.cc
@@ -0,0 +1,928 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "env/composite_env_wrapper.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+ std::string result;
+ while (result.size() < n) {
+ result.append(partial_string);
+ }
+ result.resize(n);
+ return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%d.", n);
+ return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+ return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+// Param type is tuple<int, bool>
+// get<0>(tuple): non-zero if recycling log, zero if regular log
+// get<1>(tuple): true if allow retry after read EOF, false otherwise
+class LogTest : public ::testing::TestWithParam<std::tuple<int, bool>> {
+ private:
+ class StringSource : public SequentialFile {
+ public:
+ Slice& contents_;
+ bool force_error_;
+ size_t force_error_position_;
+ bool force_eof_;
+ size_t force_eof_position_;
+ bool returned_partial_;
+ bool fail_after_read_partial_;
+ explicit StringSource(Slice& contents, bool fail_after_read_partial)
+ : contents_(contents),
+ force_error_(false),
+ force_error_position_(0),
+ force_eof_(false),
+ force_eof_position_(0),
+ returned_partial_(false),
+ fail_after_read_partial_(fail_after_read_partial) {}
+
+ Status Read(size_t n, Slice* result, char* scratch) override {
+ if (fail_after_read_partial_) {
+ EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+ }
+
+ if (force_error_) {
+ if (force_error_position_ >= n) {
+ force_error_position_ -= n;
+ } else {
+ *result = Slice(contents_.data(), force_error_position_);
+ contents_.remove_prefix(force_error_position_);
+ force_error_ = false;
+ returned_partial_ = true;
+ return Status::Corruption("read error");
+ }
+ }
+
+ if (contents_.size() < n) {
+ n = contents_.size();
+ returned_partial_ = true;
+ }
+
+ if (force_eof_) {
+ if (force_eof_position_ >= n) {
+ force_eof_position_ -= n;
+ } else {
+ force_eof_ = false;
+ n = force_eof_position_;
+ returned_partial_ = true;
+ }
+ }
+
+ // By using scratch we ensure that caller has control over the
+ // lifetime of result.data()
+ memcpy(scratch, contents_.data(), n);
+ *result = Slice(scratch, n);
+
+ contents_.remove_prefix(n);
+ return Status::OK();
+ }
+
+ Status Skip(uint64_t n) override {
+ if (n > contents_.size()) {
+ contents_.clear();
+        return Status::NotFound("in-memory file skipped past end");
+ }
+
+ contents_.remove_prefix(n);
+
+ return Status::OK();
+ }
+ };
+
+ inline StringSource* GetStringSourceFromLegacyReader(
+ SequentialFileReader* reader) {
+ LegacySequentialFileWrapper* file =
+ static_cast<LegacySequentialFileWrapper*>(reader->file());
+ return static_cast<StringSource*>(file->target());
+ }
+
+ class ReportCollector : public Reader::Reporter {
+ public:
+ size_t dropped_bytes_;
+ std::string message_;
+
+ ReportCollector() : dropped_bytes_(0) { }
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ std::string& dest_contents() {
+ auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+ assert(dest);
+ return dest->contents_;
+ }
+
+ const std::string& dest_contents() const {
+ auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+ assert(dest);
+ return dest->contents_;
+ }
+
+ void reset_source_contents() {
+ auto src = GetStringSourceFromLegacyReader(reader_->file());
+ assert(src);
+ src->contents_ = dest_contents();
+ }
+
+ Slice reader_contents_;
+ std::unique_ptr<WritableFileWriter> dest_holder_;
+ std::unique_ptr<SequentialFileReader> source_holder_;
+ ReportCollector report_;
+ Writer writer_;
+ std::unique_ptr<Reader> reader_;
+
+ protected:
+ bool allow_retry_read_;
+
+ public:
+ LogTest()
+ : reader_contents_(),
+ dest_holder_(test::GetWritableFileWriter(
+ new test::StringSink(&reader_contents_), "" /* don't care */)),
+ source_holder_(test::GetSequentialFileReader(
+ new StringSource(reader_contents_, !std::get<1>(GetParam())),
+ "" /* file name */)),
+ writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())),
+ allow_retry_read_(std::get<1>(GetParam())) {
+ if (allow_retry_read_) {
+ reader_.reset(new FragmentBufferedReader(
+ nullptr, std::move(source_holder_), &report_, true /* checksum */,
+ 123 /* log_number */));
+ } else {
+ reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_,
+ true /* checksum */, 123 /* log_number */));
+ }
+ }
+
+ Slice* get_reader_contents() { return &reader_contents_; }
+
+ void Write(const std::string& msg) {
+ writer_.AddRecord(Slice(msg));
+ }
+
+ size_t WrittenBytes() const {
+ return dest_contents().size();
+ }
+
+ std::string Read(const WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords) {
+ std::string scratch;
+ Slice record;
+ bool ret = false;
+ ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode);
+ if (ret) {
+ return record.ToString();
+ } else {
+ return "EOF";
+ }
+ }
+
+ void IncrementByte(int offset, char delta) {
+ dest_contents()[offset] += delta;
+ }
+
+ void SetByte(int offset, char new_byte) {
+ dest_contents()[offset] = new_byte;
+ }
+
+ void ShrinkSize(int bytes) {
+ auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+ assert(dest);
+ dest->Drop(bytes);
+ }
+
+ void FixChecksum(int header_offset, int len, bool recyclable) {
+ // Compute crc of type/len/data
+ int header_size = recyclable ? kRecyclableHeaderSize : kHeaderSize;
+ uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6],
+ header_size - 6 + len);
+ crc = crc32c::Mask(crc);
+ EncodeFixed32(&dest_contents()[header_offset], crc);
+ }
+
+ void ForceError(size_t position = 0) {
+ auto src = GetStringSourceFromLegacyReader(reader_->file());
+ src->force_error_ = true;
+ src->force_error_position_ = position;
+ }
+
+ size_t DroppedBytes() const {
+ return report_.dropped_bytes_;
+ }
+
+ std::string ReportMessage() const {
+ return report_.message_;
+ }
+
+ void ForceEOF(size_t position = 0) {
+ auto src = GetStringSourceFromLegacyReader(reader_->file());
+ src->force_eof_ = true;
+ src->force_eof_position_ = position;
+ }
+
+ void UnmarkEOF() {
+ auto src = GetStringSourceFromLegacyReader(reader_->file());
+ src->returned_partial_ = false;
+ reader_->UnmarkEOF();
+ }
+
+ bool IsEOF() { return reader_->IsEOF(); }
+
+ // Returns OK iff recorded error message contains "msg"
+ std::string MatchError(const std::string& msg) const {
+ if (report_.message_.find(msg) == std::string::npos) {
+ return report_.message_;
+ } else {
+ return "OK";
+ }
+ }
+};
+
+TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
+
+TEST_P(LogTest, ReadWrite) {
+ Write("foo");
+ Write("bar");
+ Write("");
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST_P(LogTest, ManyBlocks) {
+ for (int i = 0; i < 100000; i++) {
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, Fragmentation) {
+ Write("small");
+ Write(BigString("medium", 50000));
+ Write(BigString("large", 100000));
+ ASSERT_EQ("small", Read());
+ ASSERT_EQ(BigString("medium", 50000), Read());
+ ASSERT_EQ(BigString("large", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer) {
+ // Make a trailer that is exactly the same length as an empty record.
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer2) {
+ // Make a trailer that is exactly the same length as an empty record.
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, ShortTrailer) {
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, AlignedEof) {
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, RandomRead) {
+ const int N = 500;
+ Random write_rnd(301);
+ for (int i = 0; i < N; i++) {
+ Write(RandomSkewedString(i, &write_rnd));
+ }
+ Random read_rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST_P(LogTest, ReadError) {
+ Write("foo");
+ ForceError();
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, BadRecordType) {
+ Write("foo");
+ // Type is stored in header[6]
+ IncrementByte(6, 100);
+ FixChecksum(0, 3, false);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) {
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read());
+ // Truncated last record is ignored, not treated as an error
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+  // In kAbsoluteConsistency mode the truncated last record is treated as an
+  // error and reported as dropped bytes.
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
+TEST_P(LogTest, BadLength) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ const int kPayloadSize = kBlockSize - header_size;
+ Write(BigString("bar", kPayloadSize));
+ Write("foo");
+ // Least significant size byte is stored in header[4].
+ IncrementByte(4, 1);
+ if (!recyclable_log) {
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("bad record length"));
+ } else {
+ ASSERT_EQ("EOF", Read());
+ }
+}
+
+TEST_P(LogTest, BadLengthAtEndIsIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
+TEST_P(LogTest, ChecksumMismatch) {
+ Write("foooooo");
+ IncrementByte(0, 14);
+ ASSERT_EQ("EOF", Read());
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ ASSERT_EQ(14U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("checksum mismatch"));
+ } else {
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+ }
+}
+
+TEST_P(LogTest, UnexpectedMiddleType) {
+ Write("foo");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(6, static_cast<char>(recyclable_log ? kRecyclableMiddleType
+ : kMiddleType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedLastType) {
+ Write("foo");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(6,
+ static_cast<char>(recyclable_log ? kRecyclableLastType : kLastType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedFullType) {
+ Write("foo");
+ Write("bar");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(
+ 6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, UnexpectedFirstType) {
+ Write("foo");
+ Write(BigString("bar", 100000));
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(
+ 6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ(BigString("bar", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, MissingLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, MissingLastIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data"));
+}
+
+TEST_P(LogTest, PartialLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, PartialLastIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError(
+ "Corruption: truncated headerCorruption: "
+ "error reading trailing data"));
+}
+
+TEST_P(LogTest, ErrorJoinsRecords) {
+ // Consider two fragmented records:
+ // first(R1) last(R1) first(R2) last(R2)
+ // where the middle two fragments disappear. We do not want
+ // first(R1),last(R2) to get joined and returned as a valid record.
+
+ // Write records that span two blocks
+ Write(BigString("foo", kBlockSize));
+ Write(BigString("bar", kBlockSize));
+ Write("correct");
+
+ // Wipe the middle block
+ for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+ SetByte(offset, 'x');
+ }
+
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ ASSERT_EQ("correct", Read());
+ ASSERT_EQ("EOF", Read());
+ size_t dropped = DroppedBytes();
+ ASSERT_LE(dropped, 2 * kBlockSize + 100);
+ ASSERT_GE(dropped, 2 * kBlockSize);
+ } else {
+ ASSERT_EQ("EOF", Read());
+ }
+}
+
+TEST_P(LogTest, ClearEofSingleBlock) {
+ Write("foo");
+ Write("bar");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ ForceEOF(3 + header_size + 2);
+ ASSERT_EQ("foo", Read());
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_TRUE(IsEOF());
+ ASSERT_EQ("EOF", Read());
+ Write("xxx");
+ UnmarkEOF();
+ ASSERT_EQ("xxx", Read());
+ ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofMultiBlock) {
+ size_t num_full_blocks = 5;
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ size_t n = (kBlockSize - header_size) * num_full_blocks + 25;
+ Write(BigString("foo", n));
+ Write(BigString("bar", n));
+ ForceEOF(n + num_full_blocks * header_size + header_size + 3);
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_TRUE(IsEOF());
+ UnmarkEOF();
+ ASSERT_EQ(BigString("bar", n), Read());
+ ASSERT_TRUE(IsEOF());
+ Write(BigString("xxx", n));
+ UnmarkEOF();
+ ASSERT_EQ(BigString("xxx", n), Read());
+ ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofError) {
+ // If an error occurs during Read() in UnmarkEOF(), the records contained
+ // in the buffer should be returned on subsequent calls of ReadRecord()
+ // until no more full records are left, whereafter ReadRecord() should return
+ // false to indicate that it cannot read any further.
+
+ Write("foo");
+ Write("bar");
+ UnmarkEOF();
+ ASSERT_EQ("foo", Read());
+ ASSERT_TRUE(IsEOF());
+ Write("xxx");
+ ForceError(0);
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, ClearEofError2) {
+ Write("foo");
+ Write("bar");
+ UnmarkEOF();
+ ASSERT_EQ("foo", Read());
+ Write("xxx");
+ ForceError(3);
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, Recycle) {
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ return; // test is only valid for recycled logs
+ }
+ Write("foo");
+ Write("bar");
+ Write("baz");
+ Write("bif");
+ Write("blitz");
+ while (get_reader_contents()->size() < log::kBlockSize * 2) {
+ Write("xxxxxxxxxxxxxxxx");
+ }
+ std::unique_ptr<WritableFileWriter> dest_holder(test::GetWritableFileWriter(
+ new test::OverwritingStringSink(get_reader_contents()),
+ "" /* don't care */));
+ Writer recycle_writer(std::move(dest_holder), 123, true);
+ recycle_writer.AddRecord(Slice("foooo"));
+ recycle_writer.AddRecord(Slice("bar"));
+ ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2);
+ ASSERT_EQ("foooo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+INSTANTIATE_TEST_CASE_P(bool, LogTest,
+ ::testing::Values(std::make_tuple(0, false),
+ std::make_tuple(0, true),
+ std::make_tuple(1, false),
+ std::make_tuple(1, true)));
+
+class RetriableLogTest : public ::testing::TestWithParam<int> {
+ private:
+ class ReportCollector : public Reader::Reporter {
+ public:
+ size_t dropped_bytes_;
+ std::string message_;
+
+ ReportCollector() : dropped_bytes_(0) {}
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ Slice contents_;
+ std::unique_ptr<WritableFileWriter> dest_holder_;
+ std::unique_ptr<Writer> log_writer_;
+ Env* env_;
+ EnvOptions env_options_;
+ const std::string test_dir_;
+ const std::string log_file_;
+ std::unique_ptr<WritableFileWriter> writer_;
+ std::unique_ptr<SequentialFileReader> reader_;
+ ReportCollector report_;
+ std::unique_ptr<FragmentBufferedReader> log_reader_;
+
+ public:
+ RetriableLogTest()
+ : contents_(),
+ dest_holder_(nullptr),
+ log_writer_(nullptr),
+ env_(Env::Default()),
+ test_dir_(test::PerThreadDBPath("retriable_log_test")),
+ log_file_(test_dir_ + "/log"),
+ writer_(nullptr),
+ reader_(nullptr),
+ log_reader_(nullptr) {}
+
+ Status SetupTestEnv() {
+ dest_holder_.reset(test::GetWritableFileWriter(
+ new test::StringSink(&contents_), "" /* file name */));
+ assert(dest_holder_ != nullptr);
+ log_writer_.reset(new Writer(std::move(dest_holder_), 123, GetParam()));
+ assert(log_writer_ != nullptr);
+
+ Status s;
+ s = env_->CreateDirIfMissing(test_dir_);
+ std::unique_ptr<WritableFile> writable_file;
+ if (s.ok()) {
+ s = env_->NewWritableFile(log_file_, &writable_file, env_options_);
+ }
+ if (s.ok()) {
+ writer_.reset(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), log_file_,
+ env_options_));
+ assert(writer_ != nullptr);
+ }
+ std::unique_ptr<SequentialFile> seq_file;
+ if (s.ok()) {
+ s = env_->NewSequentialFile(log_file_, &seq_file, env_options_);
+ }
+ if (s.ok()) {
+ reader_.reset(new SequentialFileReader(
+ NewLegacySequentialFileWrapper(seq_file), log_file_));
+ assert(reader_ != nullptr);
+ log_reader_.reset(new FragmentBufferedReader(
+ nullptr, std::move(reader_), &report_, true /* checksum */,
+ 123 /* log_number */));
+ assert(log_reader_ != nullptr);
+ }
+ return s;
+ }
+
+ std::string contents() {
+ auto file = test::GetStringSinkFromLegacyWriter(log_writer_->file());
+ assert(file != nullptr);
+ return file->contents_;
+ }
+
+ void Encode(const std::string& msg) { log_writer_->AddRecord(Slice(msg)); }
+
+ void Write(const Slice& data) {
+ writer_->Append(data);
+ writer_->Sync(true);
+ }
+
+ bool TryRead(std::string* result) {
+ assert(result != nullptr);
+ result->clear();
+ std::string scratch;
+ Slice record;
+ bool r = log_reader_->ReadRecord(&record, &scratch);
+ if (r) {
+ result->assign(record.data(), record.size());
+ return true;
+ } else {
+ return false;
+ }
+ }
+};
+
+TEST_P(RetriableLogTest, TailLog_PartialHeader) {
+ ASSERT_OK(SetupTestEnv());
+ std::vector<int> remaining_bytes_in_last_record;
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ bool eof = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RetriableLogTest::TailLog:AfterPart1",
+ "RetriableLogTest::TailLog:BeforeReadRecord"},
+ {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ "RetriableLogTest::TailLog:BeforePart2"}});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ [&](void* /*arg*/) { eof = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t delta = header_size - 1;
+ port::Thread log_writer_thread([&]() {
+ size_t old_sz = contents().size();
+ Encode("foo");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+ Write(Slice(part2));
+ });
+
+ std::string record;
+ port::Thread log_reader_thread([&]() {
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+ while (!TryRead(&record)) {
+ }
+ });
+ log_reader_thread.join();
+ log_writer_thread.join();
+ ASSERT_EQ("foo", record);
+ ASSERT_TRUE(eof);
+}
+
+TEST_P(RetriableLogTest, TailLog_FullHeader) {
+ ASSERT_OK(SetupTestEnv());
+ std::vector<int> remaining_bytes_in_last_record;
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ bool eof = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RetriableLogTest::TailLog:AfterPart1",
+ "RetriableLogTest::TailLog:BeforeReadRecord"},
+ {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ "RetriableLogTest::TailLog:BeforePart2"}});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ [&](void* /*arg*/) { eof = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t delta = header_size + 1;
+ port::Thread log_writer_thread([&]() {
+ size_t old_sz = contents().size();
+ Encode("foo");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+ Write(Slice(part2));
+ ASSERT_TRUE(eof);
+ });
+
+ std::string record;
+ port::Thread log_reader_thread([&]() {
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+ while (!TryRead(&record)) {
+ }
+ });
+ log_reader_thread.join();
+ log_writer_thread.join();
+ ASSERT_EQ("foo", record);
+}
+
+TEST_P(RetriableLogTest, NonBlockingReadFullRecord) {
+  // Clear all sync point callbacks even if this test does not use sync points.
+  // This is necessary; otherwise the execution of this test may hit a sync
+  // point with which a callback is registered. The registered callback may
+  // access some dead variable, causing a segfault.
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(SetupTestEnv());
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ size_t delta = header_size - 1;
+ size_t old_sz = contents().size();
+ Encode("foo-bar");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ std::string record;
+ ASSERT_FALSE(TryRead(&record));
+ ASSERT_TRUE(record.empty());
+ Write(Slice(part2));
+ ASSERT_TRUE(TryRead(&record));
+ ASSERT_EQ("foo-bar", record);
+}
+
+INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2));
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc
new file mode 100644
index 000000000..0222ee2a7
--- /dev/null
+++ b/src/rocksdb/db/log_writer.cc
@@ -0,0 +1,162 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Writer::Writer(std::unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
+ bool recycle_log_files, bool manual_flush)
+ : dest_(std::move(dest)),
+ block_offset_(0),
+ log_number_(log_number),
+ recycle_log_files_(recycle_log_files),
+ manual_flush_(manual_flush) {
+ for (int i = 0; i <= kMaxRecordType; i++) {
+ char t = static_cast<char>(i);
+ type_crc_[i] = crc32c::Value(&t, 1);
+ }
+}
+
+Writer::~Writer() {
+ if (dest_) {
+ WriteBuffer();
+ }
+}
+
+Status Writer::WriteBuffer() { return dest_->Flush(); }
+
+Status Writer::Close() {
+ Status s;
+ if (dest_) {
+ s = dest_->Close();
+ dest_.reset();
+ }
+ return s;
+}
+
+Status Writer::AddRecord(const Slice& slice) {
+ const char* ptr = slice.data();
+ size_t left = slice.size();
+
+ // Header size varies depending on whether we are recycling or not.
+ const int header_size =
+ recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize;
+
+ // Fragment the record if necessary and emit it. Note that if slice
+ // is empty, we still want to iterate once to emit a single
+ // zero-length record
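+  //
+  // Illustrative example (assuming the 32KB kBlockSize and the 7-byte legacy
+  // header): a 50000-byte record added at block_offset_ == 0 is emitted as a
+  // kFirstType fragment of 32761 bytes that fills the first block, followed
+  // by a kLastType fragment of 17239 bytes at the start of the next block.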
+ Status s;
+ bool begin = true;
+ do {
+ const int64_t leftover = kBlockSize - block_offset_;
+ assert(leftover >= 0);
+ if (leftover < header_size) {
+ // Switch to a new block
+ if (leftover > 0) {
+ // Fill the trailer (literal below relies on kHeaderSize and
+ // kRecyclableHeaderSize being <= 11)
+ assert(header_size <= 11);
+ s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ static_cast<size_t>(leftover)));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ block_offset_ = 0;
+ }
+
+ // Invariant: we never leave < header_size bytes in a block.
+ assert(static_cast<int64_t>(kBlockSize - block_offset_) >= header_size);
+
+ const size_t avail = kBlockSize - block_offset_ - header_size;
+ const size_t fragment_length = (left < avail) ? left : avail;
+
+ RecordType type;
+ const bool end = (left == fragment_length);
+ if (begin && end) {
+ type = recycle_log_files_ ? kRecyclableFullType : kFullType;
+ } else if (begin) {
+ type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
+ } else if (end) {
+ type = recycle_log_files_ ? kRecyclableLastType : kLastType;
+ } else {
+ type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
+ }
+
+ s = EmitPhysicalRecord(type, ptr, fragment_length);
+ ptr += fragment_length;
+ left -= fragment_length;
+ begin = false;
+ } while (s.ok() && left > 0);
+
+ if (s.ok()) {
+ if (!manual_flush_) {
+ s = dest_->Flush();
+ }
+ }
+
+ return s;
+}
+
+bool Writer::TEST_BufferIsEmpty() { return dest_->TEST_BufferIsEmpty(); }
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+ assert(n <= 0xffff); // Must fit in two bytes
+
+ size_t header_size;
+ char buf[kRecyclableHeaderSize];
+
+ // Format the header
+ buf[4] = static_cast<char>(n & 0xff);
+ buf[5] = static_cast<char>(n >> 8);
+ buf[6] = static_cast<char>(t);
+
+ uint32_t crc = type_crc_[t];
+ if (t < kRecyclableFullType) {
+ // Legacy record format
+ assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+ header_size = kHeaderSize;
+ } else {
+ // Recyclable record format
+ assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize);
+ header_size = kRecyclableHeaderSize;
+
+ // Only encode low 32-bits of the 64-bit log number. This means
+ // we will fail to detect an old record if we recycled a log from
+ // ~4 billion logs ago, but that is effectively impossible, and
+    // even if it were we'd be far more likely to see a false positive
+ // on the 32-bit CRC.
+ EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
+ crc = crc32c::Extend(crc, buf + 7, 4);
+ }
+
+ // Compute the crc of the record type and the payload.
+ crc = crc32c::Extend(crc, ptr, n);
+ crc = crc32c::Mask(crc); // Adjust for storage
+ EncodeFixed32(buf, crc);
+
+ // Write the header and the payload
+ Status s = dest_->Append(Slice(buf, header_size));
+ if (s.ok()) {
+ s = dest_->Append(Slice(ptr, n));
+ }
+ block_offset_ += header_size + n;
+ return s;
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h
new file mode 100644
index 000000000..a7f952edd
--- /dev/null
+++ b/src/rocksdb/db/log_writer.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+
+namespace log {
+
+/**
+ * Writer is a general purpose log stream writer. It provides an append-only
+ * abstraction for writing data. The details of how the data is written are
+ * handled by the WritableFile sub-class implementation.
+ *
+ * File format:
+ *
+ * File is broken down into variable sized records. The format of each record
+ * is described below.
+ * +-----+-------------+--+----+----------+------+-- ... ----+
+ * File | r0 | r1 |P | r2 | r3 | r4 | |
+ * +-----+-------------+--+----+----------+------+-- ... ----+
+ * <--- kBlockSize ------>|<-- kBlockSize ------>|
+ * rn = variable size records
+ * P = Padding
+ *
+ * Data is written out in kBlockSize chunks. If the next record does not fit
+ * into the space left, the leftover space will be padded with \0.
+ *
+ * Legacy record format:
+ *
+ * +---------+-----------+-----------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Payload |
+ * +---------+-----------+-----------+--- ... ---+
+ *
+ * CRC = 32bit hash computed over the record type and payload using CRC
+ * Size = Length of the payload data
+ * Type = Type of record
+ * (kZeroType, kFullType, kFirstType, kLastType, kMiddleType )
+ * The type is used to group a bunch of records together to represent
+ * blocks that are larger than kBlockSize
+ * Payload = Byte stream as long as specified by the payload size
+ *
+ * Recyclable record format:
+ *
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload |
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ *
+ * Same as above, with the addition of
+ * Log number = 32bit log file number, so that we can distinguish between
+ * records written by the most recent log writer vs a previous one.
+ */
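+// Usage sketch (illustrative; the locals below are hypothetical and the file
+// writer is assumed to be opened elsewhere):
+//
+//   std::unique_ptr<WritableFileWriter> file = ...;
+//   log::Writer writer(std::move(file), 1 /* log_number */,
+//                      false /* recycle_log_files */);
+//   Status s = writer.AddRecord(Slice("payload"));
+//   if (s.ok()) {
+//     s = writer.Close();
+//   }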
+class Writer {
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this Writer is in use.
+ explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
+ uint64_t log_number, bool recycle_log_files,
+ bool manual_flush = false);
+ // No copying allowed
+ Writer(const Writer&) = delete;
+ void operator=(const Writer&) = delete;
+
+ ~Writer();
+
+ Status AddRecord(const Slice& slice);
+
+ WritableFileWriter* file() { return dest_.get(); }
+ const WritableFileWriter* file() const { return dest_.get(); }
+
+ uint64_t get_log_number() const { return log_number_; }
+
+ Status WriteBuffer();
+
+ Status Close();
+
+ bool TEST_BufferIsEmpty();
+
+ private:
+ std::unique_ptr<WritableFileWriter> dest_;
+ size_t block_offset_; // Current offset in block
+ uint64_t log_number_;
+ bool recycle_log_files_;
+
+ // crc32c values for all supported record types. These are
+ // pre-computed to reduce the overhead of computing the crc of the
+ // record type stored in the header.
+ uint32_t type_crc_[kMaxRecordType + 1];
+
+ Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // If true, it does not flush after each write. Instead it relies on the
+  // upper layer to manually flush by calling ::WriteBuffer().
+ bool manual_flush_;
+};
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/logs_with_prep_tracker.cc b/src/rocksdb/db/logs_with_prep_tracker.cc
new file mode 100644
index 000000000..ff98155c4
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/logs_with_prep_tracker.h"
+
+#include "port/likely.h"
+
+namespace ROCKSDB_NAMESPACE {
+void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) {
+ assert(log != 0);
+ std::lock_guard<std::mutex> lock(prepared_section_completed_mutex_);
+ auto it = prepared_section_completed_.find(log);
+ if (UNLIKELY(it == prepared_section_completed_.end())) {
+ prepared_section_completed_[log] = 1;
+ } else {
+ it->second += 1;
+ }
+}
+
+void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) {
+ assert(log != 0);
+ std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+
+ auto rit = logs_with_prep_.rbegin();
+ bool updated = false;
+ // Most probably the last log is the one that is being marked for
+ // having a prepare section; so search from the end.
+ for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) {
+ if (rit->log == log) {
+ rit->cnt++;
+ updated = true;
+ break;
+ }
+ }
+ if (!updated) {
+ // We are either at the start, or at a position with rit->log < log
+ logs_with_prep_.insert(rit.base(), {log, 1});
+ }
+}
+
+uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() {
+ std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+ auto it = logs_with_prep_.begin();
+ // start with the smallest log
+ for (; it != logs_with_prep_.end();) {
+ auto min_log = it->log;
+ {
+ std::lock_guard<std::mutex> lock2(prepared_section_completed_mutex_);
+ auto completed_it = prepared_section_completed_.find(min_log);
+ if (completed_it == prepared_section_completed_.end() ||
+ completed_it->second < it->cnt) {
+ return min_log;
+ }
+ assert(completed_it != prepared_section_completed_.end() &&
+ completed_it->second == it->cnt);
+ prepared_section_completed_.erase(completed_it);
+ }
+    // Erasing from the beginning of a vector is not efficient, but this
+    // function is not on the fast path.
+ it = logs_with_prep_.erase(it);
+ }
+ // no such log found
+ return 0;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/logs_with_prep_tracker.h b/src/rocksdb/db/logs_with_prep_tracker.h
new file mode 100644
index 000000000..86c88012a
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <stdint.h>
+#include <cassert>
+#include <cstdlib>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class is used to track the log files with outstanding prepare entries.
+class LogsWithPrepTracker {
+ public:
+ // Called when a transaction prepared in `log` has been committed or aborted.
+ void MarkLogAsHavingPrepSectionFlushed(uint64_t log);
+ // Called when a transaction is prepared in `log`.
+ void MarkLogAsContainingPrepSection(uint64_t log);
+ // Return the earliest log file with outstanding prepare entries.
+ uint64_t FindMinLogContainingOutstandingPrep();
+ size_t TEST_PreparedSectionCompletedSize() {
+ return prepared_section_completed_.size();
+ }
+ size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); }
+
+ private:
+ // REQUIRES: logs_with_prep_mutex_ held
+ //
+ // sorted list of log numbers still containing prepared data.
+ // this is used by FindObsoleteFiles to determine which
+ // flushed logs we must keep around because they still
+ // contain prepared data which has not been committed or rolled back
+ struct LogCnt {
+ uint64_t log; // the log number
+ uint64_t cnt; // number of prepared sections in the log
+ };
+ std::vector<LogCnt> logs_with_prep_;
+ std::mutex logs_with_prep_mutex_;
+
+ // REQUIRES: prepared_section_completed_mutex_ held
+ //
+ // to be used in conjunction with logs_with_prep_.
+  // once a transaction with data in log L is committed or rolled back,
+  // rather than updating logs_with_prep_ directly, we keep track of that
+  // in prepared_section_completed_, which maps LOG -> instance_count. This
+  // helps avoid contention between a commit thread and the prepare threads.
+ //
+  // when trying to determine the minimum log still active we first
+  // consult logs_with_prep_. While the count of its smallest entry maps to
+  // an equal value in prepared_section_completed_, we erase the log from
+  // both logs_with_prep_ and prepared_section_completed_.
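+  //
+  // Illustrative example: two transactions prepare in log 5, so
+  // logs_with_prep_ holds {log=5, cnt=2}. After one of them commits,
+  // prepared_section_completed_[5] == 1 and
+  // FindMinLogContainingOutstandingPrep() still returns 5. Once the second
+  // one is flushed (the count reaches 2), the next call erases both entries
+  // and moves on to the next log, or returns 0 if none is left.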
+ std::unordered_map<uint64_t, uint64_t> prepared_section_completed_;
+ std::mutex prepared_section_completed_mutex_;
+
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/lookup_key.h b/src/rocksdb/db/lookup_key.h
new file mode 100644
index 000000000..51e5daed1
--- /dev/null
+++ b/src/rocksdb/db/lookup_key.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <utility>
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+ // Initialize *this for looking up user_key at a snapshot with
+ // the specified sequence number.
+ LookupKey(const Slice& _user_key, SequenceNumber sequence,
+ const Slice* ts = nullptr);
+
+ ~LookupKey();
+
+ // Return a key suitable for lookup in a MemTable.
+ Slice memtable_key() const {
+ return Slice(start_, static_cast<size_t>(end_ - start_));
+ }
+
+ // Return an internal key (suitable for passing to an internal iterator)
+ Slice internal_key() const {
+ return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
+ }
+
+ // Return the user key
+ Slice user_key() const {
+ return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
+ }
+
+ private:
+ // We construct a char array of the form:
+ // klength varint32 <-- start_
+ // userkey char[klength] <-- kstart_
+ // tag uint64
+ // <-- end_
+ // The array is a suitable MemTable key.
+ // The suffix starting with "userkey" can be used as an InternalKey.
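+  //
+  // Illustrative example: for the 3-byte user key "abc", klength is
+  // 3 + 8 = 11, so the buffer holds varint32(11) (one byte), "abc", and the
+  // 8-byte (sequence << 8 | type) tag; memtable_key() then spans 12 bytes,
+  // internal_key() 11 bytes and user_key() 3 bytes.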
+ const char* start_;
+ const char* kstart_;
+ const char* end_;
+ char space_[200]; // Avoid allocation for short keys
+
+ // No copying allowed
+ LookupKey(const LookupKey&);
+ void operator=(const LookupKey&);
+};
+
+inline LookupKey::~LookupKey() {
+ if (start_ != space_) delete[] start_;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/malloc_stats.cc b/src/rocksdb/db/malloc_stats.cc
new file mode 100644
index 000000000..12824e516
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/malloc_stats.h"
+
+#ifndef ROCKSDB_LITE
+#include <memory>
+#include <string.h>
+
+#include "port/jemalloc_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_JEMALLOC
+
+typedef struct {
+ char* cur;
+ char* end;
+} MallocStatus;
+
+static void GetJemallocStatus(void* mstat_arg, const char* status) {
+ MallocStatus* mstat = reinterpret_cast<MallocStatus*>(mstat_arg);
+ size_t status_len = status ? strlen(status) : 0;
+ size_t buf_size = (size_t)(mstat->end - mstat->cur);
+ if (!status_len || status_len > buf_size) {
+ return;
+ }
+
+ snprintf(mstat->cur, buf_size, "%s", status);
+ mstat->cur += status_len;
+}
+void DumpMallocStats(std::string* stats) {
+ if (!HasJemalloc()) {
+ return;
+ }
+ MallocStatus mstat;
+ const unsigned int kMallocStatusLen = 1000000;
+ std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]};
+ mstat.cur = buf.get();
+ mstat.end = buf.get() + kMallocStatusLen;
+ malloc_stats_print(GetJemallocStatus, &mstat, "");
+ stats->append(buf.get());
+}
+#else
+void DumpMallocStats(std::string*) {}
+#endif // ROCKSDB_JEMALLOC
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/malloc_stats.h b/src/rocksdb/db/malloc_stats.h
new file mode 100644
index 000000000..18aff3ad0
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpMallocStats(std::string*);
+
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/manual_compaction_test.cc b/src/rocksdb/db/manual_compaction_test.cc
new file mode 100644
index 000000000..22cd919b5
--- /dev/null
+++ b/src/rocksdb/db/manual_compaction_test.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Test for issue 178: a manual compaction causes deleted data to reappear.
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testharness.h"
+
+using namespace ROCKSDB_NAMESPACE;
+
+namespace {
+
+// Reasoning: previously the number was 1100000. Since the keys are written to
+// the batch in one write, each write will result in one SST file. We reduced
+// the write_buffer_size to 1K to basically have the same effect with far
+// fewer keys, which results in a shorter test runtime.
+const int kNumKeys = 1100;
+
+std::string Key1(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "my_key_%d", i);
+ return buf;
+}
+
+std::string Key2(int i) {
+ return Key1(i) + "_xxx";
+}
+
+class ManualCompactionTest : public testing::Test {
+ public:
+ ManualCompactionTest() {
+ // Get rid of any state from an old run.
+ dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath("rocksdb_cbug_test");
+ DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options());
+ }
+
+ std::string dbname_;
+};
+
+class DestroyAllCompactionFilter : public CompactionFilter {
+ public:
+ DestroyAllCompactionFilter() {}
+
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return existing_value.ToString() == "destroy";
+ }
+
+ const char* Name() const override { return "DestroyAllCompactionFilter"; }
+};
+
+TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
+ for (int iter = 0; iter < 2; ++iter) {
+ DB* db;
+ Options options;
+ if (iter == 0) { // level compaction
+ options.num_levels = 3;
+ options.compaction_style = kCompactionStyleLevel;
+ } else { // universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+ options.create_if_missing = true;
+ options.compression = ROCKSDB_NAMESPACE::kNoCompression;
+ options.compaction_filter = new DestroyAllCompactionFilter();
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+
+ db->Put(WriteOptions(), Slice("key1"), Slice("destroy"));
+ db->Put(WriteOptions(), Slice("key2"), Slice("destroy"));
+ db->Put(WriteOptions(), Slice("key3"), Slice("value3"));
+ db->Put(WriteOptions(), Slice("key4"), Slice("destroy"));
+
+ Slice key4("key4");
+ db->CompactRange(CompactRangeOptions(), nullptr, &key4);
+ Iterator* itr = db->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_TRUE(itr->Valid());
+ ASSERT_EQ("key3", itr->key().ToString());
+ itr->Next();
+ ASSERT_TRUE(!itr->Valid());
+ delete itr;
+
+ delete options.compaction_filter;
+ delete db;
+ DestroyDB(dbname_, options);
+ }
+}
+
+TEST_F(ManualCompactionTest, Test) {
+ // Open database. Disable compression since it affects the creation
+ // of layers and the code below is trying to test against a very
+ // specific scenario.
+ ROCKSDB_NAMESPACE::DB* db;
+ ROCKSDB_NAMESPACE::Options db_options;
+ db_options.write_buffer_size = 1024;
+ db_options.create_if_missing = true;
+ db_options.compression = ROCKSDB_NAMESPACE::kNoCompression;
+ ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(db_options, dbname_, &db));
+
+ // create first key range
+ ROCKSDB_NAMESPACE::WriteBatch batch;
+ for (int i = 0; i < kNumKeys; i++) {
+ batch.Put(Key1(i), "value for range 1 key");
+ }
+ ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch));
+
+ // create second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ batch.Put(Key2(i), "value for range 2 key");
+ }
+ ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch));
+
+ // delete second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ batch.Delete(Key2(i));
+ }
+ ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch));
+
+ // compact database
+ std::string start_key = Key1(0);
+ std::string end_key = Key1(kNumKeys - 1);
+ ROCKSDB_NAMESPACE::Slice least(start_key.data(), start_key.size());
+ ROCKSDB_NAMESPACE::Slice greatest(end_key.data(), end_key.size());
+
+ // commenting out the line below causes the example to work correctly
+ db->CompactRange(CompactRangeOptions(), &least, &greatest);
+
+ // count the keys
+ ROCKSDB_NAMESPACE::Iterator* iter =
+ db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions());
+ int num_keys = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ delete iter;
+ ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
+
+ // close database
+ delete db;
+ DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options());
+}
+
+} // anonymous namespace
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc
new file mode 100644
index 000000000..45483ea09
--- /dev/null
+++ b/src/rocksdb/db/memtable.cc
@@ -0,0 +1,1122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "memory/arena.h"
+#include "memory/memory_usage.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ImmutableMemTableOptions::ImmutableMemTableOptions(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options)
+ : arena_block_size(mutable_cf_options.arena_block_size),
+ memtable_prefix_bloom_bits(
+ static_cast<uint32_t>(
+ static_cast<double>(mutable_cf_options.write_buffer_size) *
+ mutable_cf_options.memtable_prefix_bloom_size_ratio) *
+ 8u),
+ memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
+ memtable_whole_key_filtering(
+ mutable_cf_options.memtable_whole_key_filtering),
+ inplace_update_support(ioptions.inplace_update_support),
+ inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
+ inplace_callback(ioptions.inplace_callback),
+ max_successive_merges(mutable_cf_options.max_successive_merges),
+ statistics(ioptions.statistics),
+ merge_operator(ioptions.merge_operator),
+ info_log(ioptions.info_log) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber latest_seq, uint32_t column_family_id)
+ : comparator_(cmp),
+ moptions_(ioptions, mutable_cf_options),
+ refs_(0),
+ kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
+ mem_tracker_(write_buffer_manager),
+ arena_(moptions_.arena_block_size,
+ (write_buffer_manager != nullptr &&
+ (write_buffer_manager->enabled() ||
+ write_buffer_manager->cost_to_cache()))
+ ? &mem_tracker_
+ : nullptr,
+ mutable_cf_options.memtable_huge_page_size),
+ table_(ioptions.memtable_factory->CreateMemTableRep(
+ comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
+ ioptions.info_log, column_family_id)),
+ range_del_table_(SkipListFactory().CreateMemTableRep(
+ comparator_, &arena_, nullptr /* transform */, ioptions.info_log,
+ column_family_id)),
+ is_range_del_table_empty_(true),
+ data_size_(0),
+ num_entries_(0),
+ num_deletes_(0),
+ write_buffer_size_(mutable_cf_options.write_buffer_size),
+ flush_in_progress_(false),
+ flush_completed_(false),
+ file_number_(0),
+ first_seqno_(0),
+ earliest_seqno_(latest_seq),
+ creation_seq_(latest_seq),
+ mem_next_logfile_number_(0),
+ min_prep_log_referenced_(0),
+ locks_(moptions_.inplace_update_support
+ ? moptions_.inplace_update_num_locks
+ : 0),
+ prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+ flush_state_(FLUSH_NOT_REQUESTED),
+ env_(ioptions.env),
+ insert_with_hint_prefix_extractor_(
+ ioptions.memtable_insert_with_hint_prefix_extractor),
+ oldest_key_time_(std::numeric_limits<uint64_t>::max()),
+ atomic_flush_seqno_(kMaxSequenceNumber),
+ approximate_memory_usage_(0) {
+ UpdateFlushState();
+ // something went wrong if we need to flush before inserting anything
+ assert(!ShouldScheduleFlush());
+
+ // use bloom_filter_ for both whole key and prefix bloom filter
+ if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
+ moptions_.memtable_prefix_bloom_bits > 0) {
+ bloom_filter_.reset(
+ new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
+ 6 /* hard coded 6 probes */,
+ moptions_.memtable_huge_page_size, ioptions.info_log));
+ }
+}
+
+MemTable::~MemTable() {
+ mem_tracker_.FreeMem();
+ assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+ autovector<size_t> usages = {
+ arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
+ range_del_table_->ApproximateMemoryUsage(),
+ ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
+ size_t total_usage = 0;
+ for (size_t usage : usages) {
+    // If usage + total_usage >= kMaxSizet, return kMaxSizet.
+    // The comparison below is written this way to avoid numeric overflow.
+ if (usage >= port::kMaxSizet - total_usage) {
+ return port::kMaxSizet;
+ }
+ total_usage += usage;
+ }
+ approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+ // otherwise, return the actual usage
+ return total_usage;
+}
+
+bool MemTable::ShouldFlushNow() {
+ size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+  // Often we cannot allocate arena blocks that exactly match the buffer size.
+  // Thus we have to decide whether we should over-allocate or under-allocate.
+  // This constant can be interpreted as: if we still have more than
+  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to
+  // over-allocate one more block.
+ const double kAllowOverAllocationRatio = 0.6;
+
+  // If the arena still has room for a new block allocation, we can safely say
+  // it shouldn't flush.
+ auto allocated_memory = table_->ApproximateMemoryUsage() +
+ range_del_table_->ApproximateMemoryUsage() +
+ arena_.MemoryAllocatedBytes();
+
+ approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+
+ // if we can still allocate one more block without exceeding the
+ // over-allocation ratio, then we should not flush.
+ if (allocated_memory + kArenaBlockSize <
+ write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+ return false;
+ }
+
+  // if the user keeps adding entries that exceed write_buffer_size, we need to
+ // flush earlier even though we still have much available memory left.
+ if (allocated_memory >
+ write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+ return true;
+ }
+
+  // In this code path, Arena has already allocated its "last block", which
+  // means the total allocated memory size is either:
+  //  (1) "moderately" over-allocated (no more than `0.6 * arena block
+  //  size`). Or,
+ // (2) the allocated memory is less than write buffer size, but we'll stop
+ // here since if we allocate a new arena block, we'll over allocate too much
+ // more (half of the arena block size) memory.
+ //
+ // In either case, to avoid over-allocate, the last block will stop allocation
+ // when its usage reaches a certain ratio, which we carefully choose "0.75
+ // full" as the stop condition because it addresses the following issue with
+ // great simplicity: What if the next inserted entry's size is
+ // bigger than AllocatedAndUnused()?
+ //
+ // The answer is: if the entry size is also bigger than 0.25 *
+ // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+ // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+ // and regular block. In either case, we *overly* over-allocated.
+ //
+ // Therefore, setting the last block to be at most "0.75 full" avoids both
+ // cases.
+ //
+  // NOTE: the average percentage of wasted space of this approach can be
+  // computed as: "arena block size * 0.25 / write buffer size". Users who
+  // specify a small write buffer size and/or big arena block size may suffer.
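+  //
+  // Illustrative numbers: with a 64MB write buffer and an 8MB arena block,
+  // the checks above keep allocating while allocated_memory + 8MB is below
+  // 64MB + 4.8MB, flush immediately once allocated_memory exceeds 68.8MB,
+  // and otherwise flush only when the last block has less than 2MB
+  // (kArenaBlockSize / 4) left unused.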
+ return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
+
+void MemTable::UpdateFlushState() {
+ auto state = flush_state_.load(std::memory_order_relaxed);
+ if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
+ // ignore CAS failure, because that means somebody else requested
+ // a flush
+ flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+}
+
+void MemTable::UpdateOldestKeyTime() {
+ uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
+ if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
+ int64_t current_time = 0;
+ auto s = env_->GetCurrentTime(&current_time);
+ if (s.ok()) {
+ assert(current_time >= 0);
+ // If fail, the timestamp is already set.
+ oldest_key_time_.compare_exchange_strong(
+ oldest_key_time, static_cast<uint64_t>(current_time),
+ std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+ }
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+ Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+ return comparator.CompareKeySeq(k1, k2);
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key,
+ const KeyComparator::DecodedType& key)
+ const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice a = GetLengthPrefixedSlice(prefix_len_key);
+ return comparator.CompareKeySeq(a, key);
+}
+
+void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) {
+#ifndef ROCKSDB_LITE
+ throw std::runtime_error("concurrent insert not supported");
+#else
+ abort();
+#endif
+}
+
+Slice MemTableRep::UserKey(const char* key) const {
+ Slice slice = GetLengthPrefixedSlice(key);
+ return Slice(slice.data(), slice.size() - 8);
+}
+
+KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
+ *buf = allocator_->Allocate(len);
+ return static_cast<KeyHandle>(*buf);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+const char* EncodeKey(std::string* scratch, const Slice& target) {
+ scratch->clear();
+ PutVarint32(scratch, static_cast<uint32_t>(target.size()));
+ scratch->append(target.data(), target.size());
+ return scratch->data();
+}
+
+class MemTableIterator : public InternalIterator {
+ public:
+ MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
+ Arena* arena, bool use_range_del_table = false)
+ : bloom_(nullptr),
+ prefix_extractor_(mem.prefix_extractor_),
+ comparator_(mem.comparator_),
+ valid_(false),
+ arena_mode_(arena != nullptr),
+ value_pinned_(
+ !mem.GetImmutableMemTableOptions()->inplace_update_support) {
+ if (use_range_del_table) {
+ iter_ = mem.range_del_table_->GetIterator(arena);
+ } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
+ !read_options.auto_prefix_mode) {
+ // Auto prefix mode is not implemented in memtable yet.
+ bloom_ = mem.bloom_filter_.get();
+ iter_ = mem.table_->GetDynamicPrefixIterator(arena);
+ } else {
+ iter_ = mem.table_->GetIterator(arena);
+ }
+ }
+ // No copying allowed
+ MemTableIterator(const MemTableIterator&) = delete;
+ void operator=(const MemTableIterator&) = delete;
+
+ ~MemTableIterator() override {
+#ifndef NDEBUG
+ // Assert that the MemTableIterator is never deleted while
+ // Pinning is Enabled.
+ assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
+#endif
+ if (arena_mode_) {
+ iter_->~Iterator();
+ } else {
+ delete iter_;
+ }
+ }
+
+#ifndef NDEBUG
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ }
+ PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+#endif
+
+ bool Valid() const override { return valid_; }
+ void Seek(const Slice& k) override {
+ PERF_TIMER_GUARD(seek_on_memtable_time);
+ PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+ if (bloom_) {
+ // iterator should only use prefix bloom filter
+ Slice user_k(ExtractUserKey(k));
+ if (prefix_extractor_->InDomain(user_k) &&
+ !bloom_->MayContain(prefix_extractor_->Transform(user_k))) {
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ valid_ = false;
+ return;
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ iter_->Seek(k, nullptr);
+ valid_ = iter_->Valid();
+ }
+ void SeekForPrev(const Slice& k) override {
+ PERF_TIMER_GUARD(seek_on_memtable_time);
+ PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+ if (bloom_) {
+ Slice user_k(ExtractUserKey(k));
+ if (prefix_extractor_->InDomain(user_k) &&
+ !bloom_->MayContain(prefix_extractor_->Transform(user_k))) {
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ valid_ = false;
+ return;
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ iter_->Seek(k, nullptr);
+ valid_ = iter_->Valid();
+ if (!Valid()) {
+ SeekToLast();
+ }
+ while (Valid() && comparator_.comparator.Compare(k, key()) < 0) {
+ Prev();
+ }
+ }
+ void SeekToFirst() override {
+ iter_->SeekToFirst();
+ valid_ = iter_->Valid();
+ }
+ void SeekToLast() override {
+ iter_->SeekToLast();
+ valid_ = iter_->Valid();
+ }
+ void Next() override {
+ PERF_COUNTER_ADD(next_on_memtable_count, 1);
+ assert(Valid());
+ iter_->Next();
+ valid_ = iter_->Valid();
+ }
+ void Prev() override {
+ PERF_COUNTER_ADD(prev_on_memtable_count, 1);
+ assert(Valid());
+ iter_->Prev();
+ valid_ = iter_->Valid();
+ }
+ Slice key() const override {
+ assert(Valid());
+ return GetLengthPrefixedSlice(iter_->key());
+ }
+ Slice value() const override {
+ assert(Valid());
+ Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+ return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+ }
+
+ Status status() const override { return Status::OK(); }
+
+ bool IsKeyPinned() const override {
+ // memtable data is always pinned
+ return true;
+ }
+
+ bool IsValuePinned() const override {
+ // memtable value is always pinned, except if we allow inplace update.
+ return value_pinned_;
+ }
+
+ private:
+ DynamicBloom* bloom_;
+ const SliceTransform* const prefix_extractor_;
+ const MemTable::KeyComparator comparator_;
+ MemTableRep::Iterator* iter_;
+ bool valid_;
+ bool arena_mode_;
+ bool value_pinned_;
+};
+
+InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
+ Arena* arena) {
+ assert(arena != nullptr);
+ auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+ return new (mem) MemTableIterator(*this, read_options, arena);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq) {
+ if (read_options.ignore_range_deletions ||
+ is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+ return nullptr;
+ }
+ auto* unfragmented_iter = new MemTableIterator(
+ *this, read_options, nullptr /* arena */, true /* use_range_del_table */);
+ if (unfragmented_iter == nullptr) {
+ return nullptr;
+ }
+ auto fragmented_tombstone_list =
+ std::make_shared<FragmentedRangeTombstoneList>(
+ std::unique_ptr<InternalIterator>(unfragmented_iter),
+ comparator_.comparator);
+
+ auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
+ fragmented_tombstone_list, comparator_.comparator, read_seq);
+ return fragmented_iter;
+}
+
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+ return &locks_[fastrange64(GetSliceNPHash64(key), locks_.size())];
+}
+
+MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey) {
+ uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
+ entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
+ if (entry_count == 0) {
+ return {0, 0};
+ }
+ uint64_t n = num_entries_.load(std::memory_order_relaxed);
+ if (n == 0) {
+ return {0, 0};
+ }
+ if (entry_count > n) {
+    // (range_del_)table_->ApproximateNumEntries() is just an estimate, so it
+    // can be larger than the number of entries we actually have. Cap it to the
+    // number of entries we have to limit the inaccuracy.
+ entry_count = n;
+ }
+ uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+ return {entry_count * (data_size / n), entry_count};
+}
+
+bool MemTable::Add(SequenceNumber s, ValueType type,
+ const Slice& key, /* user key */
+ const Slice& value, bool allow_concurrent,
+ MemTablePostProcessInfo* post_process_info, void** hint) {
+ // Format of an entry is concatenation of:
+ // key_size : varint32 of internal_key.size()
+ // key bytes : char[internal_key.size()]
+ // value_size : varint32 of value.size()
+ // value bytes : char[value.size()]
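+  //
+  // Illustrative example: for user key "k1" and value "v", the internal key
+  // is 2 + 8 = 10 bytes, so encoded_len is
+  // varint32(10) + 10 key bytes + varint32(1) + 1 value byte = 13 bytes.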
+ uint32_t key_size = static_cast<uint32_t>(key.size());
+ uint32_t val_size = static_cast<uint32_t>(value.size());
+ uint32_t internal_key_size = key_size + 8;
+ const uint32_t encoded_len = VarintLength(internal_key_size) +
+ internal_key_size + VarintLength(val_size) +
+ val_size;
+ char* buf = nullptr;
+ std::unique_ptr<MemTableRep>& table =
+ type == kTypeRangeDeletion ? range_del_table_ : table_;
+ KeyHandle handle = table->Allocate(encoded_len, &buf);
+
+ char* p = EncodeVarint32(buf, internal_key_size);
+ memcpy(p, key.data(), key_size);
+ Slice key_slice(p, key_size);
+ p += key_size;
+ uint64_t packed = PackSequenceAndType(s, type);
+ EncodeFixed64(p, packed);
+ p += 8;
+ p = EncodeVarint32(p, val_size);
+ memcpy(p, value.data(), val_size);
+ assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+
+ if (!allow_concurrent) {
+ // Extract prefix for insert with hint.
+ if (insert_with_hint_prefix_extractor_ != nullptr &&
+ insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
+ Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
+ bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
+ if (UNLIKELY(!res)) {
+ return res;
+ }
+ } else {
+ bool res = table->InsertKey(handle);
+ if (UNLIKELY(!res)) {
+ return res;
+ }
+ }
+
+ // this is a bit ugly, but is the way to avoid locked instructions
+ // when incrementing an atomic
+ num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
+ std::memory_order_relaxed);
+ data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
+ std::memory_order_relaxed);
+ if (type == kTypeDeletion) {
+ num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
+ std::memory_order_relaxed);
+ }
+
+ if (bloom_filter_ && prefix_extractor_ &&
+ prefix_extractor_->InDomain(key)) {
+ bloom_filter_->Add(prefix_extractor_->Transform(key));
+ }
+ if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+ bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz));
+ }
+
+ // The first sequence number inserted into the memtable
+ assert(first_seqno_ == 0 || s >= first_seqno_);
+ if (first_seqno_ == 0) {
+ first_seqno_.store(s, std::memory_order_relaxed);
+
+ if (earliest_seqno_ == kMaxSequenceNumber) {
+ earliest_seqno_.store(GetFirstSequenceNumber(),
+ std::memory_order_relaxed);
+ }
+ assert(first_seqno_.load() >= earliest_seqno_.load());
+ }
+ assert(post_process_info == nullptr);
+ UpdateFlushState();
+ } else {
+ bool res = (hint == nullptr)
+ ? table->InsertKeyConcurrently(handle)
+ : table->InsertKeyWithHintConcurrently(handle, hint);
+ if (UNLIKELY(!res)) {
+ return res;
+ }
+
+ assert(post_process_info != nullptr);
+ post_process_info->num_entries++;
+ post_process_info->data_size += encoded_len;
+ if (type == kTypeDeletion) {
+ post_process_info->num_deletes++;
+ }
+
+ if (bloom_filter_ && prefix_extractor_ &&
+ prefix_extractor_->InDomain(key)) {
+ bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key));
+ }
+ if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+ bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz));
+ }
+
+ // atomically update first_seqno_ and earliest_seqno_.
+ uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
+ while ((cur_seq_num == 0 || s < cur_seq_num) &&
+ !first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
+ }
+ uint64_t cur_earliest_seqno =
+ earliest_seqno_.load(std::memory_order_relaxed);
+    while (
+        (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
+        !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
+    }
+ }
+ if (type == kTypeRangeDeletion) {
+ is_range_del_table_empty_.store(false, std::memory_order_relaxed);
+ }
+ UpdateOldestKeyTime();
+ return true;
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+struct Saver {
+ Status* status;
+ const LookupKey* key;
+ bool* found_final_value; // Is value set correctly? Used by KeyMayExist
+ bool* merge_in_progress;
+ std::string* value;
+ SequenceNumber seq;
+ const MergeOperator* merge_operator;
+ // the merge operations encountered;
+ MergeContext* merge_context;
+ SequenceNumber max_covering_tombstone_seq;
+ MemTable* mem;
+ Logger* logger;
+ Statistics* statistics;
+ bool inplace_update_support;
+ bool do_merge;
+ Env* env_;
+ ReadCallback* callback_;
+ bool* is_blob_index;
+
+ bool CheckCallback(SequenceNumber _seq) {
+ if (callback_) {
+ return callback_->IsVisible(_seq);
+ }
+ return true;
+ }
+};
+} // namespace
+
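+// SaveValue is invoked by MemTableRep::Get() for each entry starting at the
+// lookup position, newest first within a user key. Returning true continues
+// the scan (e.g. after consuming a merge operand); returning false stops it
+// because a final value, a deletion, a different user key, or an error was
+// found.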
+static bool SaveValue(void* arg, const char* entry) {
+ Saver* s = reinterpret_cast<Saver*>(arg);
+ assert(s != nullptr);
+ MergeContext* merge_context = s->merge_context;
+ SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
+ const MergeOperator* merge_operator = s->merge_operator;
+
+ assert(merge_context != nullptr);
+
+ // entry format is:
+ // klength varint32
+ // userkey char[klength-8]
+ // tag uint64
+  //  vlength  varint32
+ // value char[vlength]
+  // Check that it belongs to the same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ uint32_t key_length;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ Slice user_key_slice = Slice(key_ptr, key_length - 8);
+ if (s->mem->GetInternalKeyComparator()
+ .user_comparator()
+ ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber seq;
+ UnPackSequenceAndType(tag, &seq, &type);
+ // If the value is not in the snapshot, skip it
+ if (!s->CheckCallback(seq)) {
+ return true; // to continue to the next seq
+ }
+
+ s->seq = seq;
+
+ if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
+ max_covering_tombstone_seq > seq) {
+ type = kTypeRangeDeletion;
+ }
+ switch (type) {
+ case kTypeBlobIndex:
+ if (s->is_blob_index == nullptr) {
+ ROCKS_LOG_ERROR(s->logger, "Encounter unexpected blob index.");
+ *(s->status) = Status::NotSupported(
+ "Encounter unsupported blob value. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ } else if (*(s->merge_in_progress)) {
+ *(s->status) =
+ Status::NotSupported("Blob DB does not support merge operator.");
+ }
+ if (!s->status->ok()) {
+ *(s->found_final_value) = true;
+ return false;
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeValue: {
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadLock();
+ }
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+ *(s->status) = Status::OK();
+ if (*(s->merge_in_progress)) {
+ if (s->do_merge) {
+ if (s->value != nullptr) {
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), &v,
+ merge_context->GetOperands(), s->value, s->logger,
+ s->statistics, s->env_, nullptr /* result_operand */, true);
+ }
+ } else {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ }
+ } else if (!s->do_merge) {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ } else if (s->value != nullptr) {
+ s->value->assign(v.data(), v.size());
+ }
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadUnlock();
+ }
+ *(s->found_final_value) = true;
+ if (s->is_blob_index != nullptr) {
+ *(s->is_blob_index) = (type == kTypeBlobIndex);
+ }
+ return false;
+ }
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ case kTypeRangeDeletion: {
+ if (*(s->merge_in_progress)) {
+ if (s->value != nullptr) {
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), nullptr,
+ merge_context->GetOperands(), s->value, s->logger,
+ s->statistics, s->env_, nullptr /* result_operand */, true);
+ }
+ } else {
+ *(s->status) = Status::NotFound();
+ }
+ *(s->found_final_value) = true;
+ return false;
+ }
+ case kTypeMerge: {
+ if (!merge_operator) {
+ *(s->status) = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ // Normally we continue the loop (return true) when we see a merge
+ // operand. But in case of an error, we should stop the loop
+ // immediately and pretend we have found the value to stop further
+ // seek. Otherwise, the later call will override this error status.
+ *(s->found_final_value) = true;
+ return false;
+ }
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+ *(s->merge_in_progress) = true;
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ if (s->do_merge && merge_operator->ShouldMerge(
+ merge_context->GetOperandsDirectionBackward())) {
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), nullptr,
+ merge_context->GetOperands(), s->value, s->logger, s->statistics,
+ s->env_, nullptr /* result_operand */, true);
+ *(s->found_final_value) = true;
+ return false;
+ }
+ return true;
+ }
+ default:
+ assert(false);
+ return true;
+ }
+ }
+
+  // s->status could be an error, merge-in-progress or not-found at this point
+ return false;
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback, bool* is_blob_index, bool do_merge) {
+ // The sequence number is updated synchronously in version_set.h
+ if (IsEmpty()) {
+    // Avoid recording stats for speed.
+ return false;
+ }
+ PERF_TIMER_GUARD(get_from_memtable_time);
+
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ NewRangeTombstoneIterator(read_opts,
+ GetInternalKeySeqno(key.internal_key())));
+ if (range_del_iter != nullptr) {
+ *max_covering_tombstone_seq =
+ std::max(*max_covering_tombstone_seq,
+ range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key()));
+ }
+
+ Slice user_key = key.user_key();
+ bool found_final_value = false;
+ bool merge_in_progress = s->IsMergeInProgress();
+ bool may_contain = true;
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+ if (bloom_filter_) {
+ // when both memtable_whole_key_filtering and prefix_extractor_ are set,
+ // only do whole key filtering for Get() to save CPU
+ if (moptions_.memtable_whole_key_filtering) {
+ may_contain =
+ bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz));
+ } else {
+ assert(prefix_extractor_);
+ may_contain =
+ !prefix_extractor_->InDomain(user_key) ||
+ bloom_filter_->MayContain(prefix_extractor_->Transform(user_key));
+ }
+ }
+
+ if (bloom_filter_ && !may_contain) {
+    // Bloom filter says the key does not exist in this memtable; skip lookup
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ *seq = kMaxSequenceNumber;
+ } else {
+ if (bloom_filter_) {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
+ is_blob_index, value, s, merge_context, seq,
+ &found_final_value, &merge_in_progress);
+ }
+
+ // No change to value, since we have not yet found a Put/Delete
+ if (!found_final_value && merge_in_progress) {
+ *s = Status::MergeInProgress();
+ }
+ PERF_COUNTER_ADD(get_from_memtable_count, 1);
+ return found_final_value;
+}
+
+void MemTable::GetFromTable(const LookupKey& key,
+ SequenceNumber max_covering_tombstone_seq,
+ bool do_merge, ReadCallback* callback,
+ bool* is_blob_index, std::string* value, Status* s,
+ MergeContext* merge_context, SequenceNumber* seq,
+ bool* found_final_value, bool* merge_in_progress) {
+ Saver saver;
+ saver.status = s;
+ saver.found_final_value = found_final_value;
+ saver.merge_in_progress = merge_in_progress;
+ saver.key = &key;
+ saver.value = value;
+ saver.seq = kMaxSequenceNumber;
+ saver.mem = this;
+ saver.merge_context = merge_context;
+ saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
+ saver.merge_operator = moptions_.merge_operator;
+ saver.logger = moptions_.info_log;
+ saver.inplace_update_support = moptions_.inplace_update_support;
+ saver.statistics = moptions_.statistics;
+ saver.env_ = env_;
+ saver.callback_ = callback;
+ saver.is_blob_index = is_blob_index;
+ saver.do_merge = do_merge;
+ table_->Get(key, &saver, SaveValue);
+ *seq = saver.seq;
+}
+
+void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool* is_blob) {
+ // The sequence number is updated synchronously in version_set.h
+ if (IsEmpty()) {
+    // Avoid recording stats for speed.
+ return;
+ }
+ PERF_TIMER_GUARD(get_from_memtable_time);
+
+ MultiGetRange temp_range(*range, range->begin(), range->end());
+ if (bloom_filter_) {
+ std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
+ std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
+ autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+ int num_keys = 0;
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ if (!prefix_extractor_) {
+ keys[num_keys++] = &iter->ukey;
+ } else if (prefix_extractor_->InDomain(iter->ukey)) {
+ prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey));
+ keys[num_keys++] = &prefixes.back();
+ }
+ }
+ bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]);
+ int idx = 0;
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ continue;
+ }
+ if (!may_match[idx]) {
+ temp_range.SkipKey(iter);
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ idx++;
+ }
+ }
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ SequenceNumber seq = kMaxSequenceNumber;
+ bool found_final_value{false};
+ bool merge_in_progress = iter->s->IsMergeInProgress();
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ NewRangeTombstoneIterator(
+ read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
+ if (range_del_iter != nullptr) {
+ iter->max_covering_tombstone_seq = std::max(
+ iter->max_covering_tombstone_seq,
+ range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
+ }
+ GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
+ callback, is_blob, iter->value->GetSelf(), iter->s,
+ &(iter->merge_context), &seq, &found_final_value,
+ &merge_in_progress);
+
+ if (!found_final_value && merge_in_progress) {
+ *(iter->s) = Status::MergeInProgress();
+ }
+
+ if (found_final_value) {
+ iter->value->PinSelf();
+ range->MarkKeyDone(iter);
+ RecordTick(moptions_.statistics, MEMTABLE_HIT);
+ }
+ }
+ PERF_COUNTER_ADD(get_from_memtable_count, 1);
+}
+
+void MemTable::Update(SequenceNumber seq,
+ const Slice& key,
+ const Slice& value) {
+ LookupKey lkey(key, seq);
+ Slice mem_key = lkey.memtable_key();
+
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(lkey.internal_key(), mem_key.data());
+
+ if (iter->Valid()) {
+ // entry format is:
+ // key_length varint32
+ // userkey char[klength-8]
+ // tag uint64
+ // vlength varint32
+ // value char[vlength]
+    // Check that it belongs to the same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Equal(
+ Slice(key_ptr, key_length - 8), lkey.user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber existing_seq;
+ UnPackSequenceAndType(tag, &existing_seq, &type);
+ assert(existing_seq != seq);
+ if (type == kTypeValue) {
+ Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+ uint32_t new_size = static_cast<uint32_t>(value.size());
+
+ // Update value, if new value size <= previous value size
+ if (new_size <= prev_size) {
+ char* p =
+ EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size);
+ WriteLock wl(GetLock(lkey.user_key()));
+ memcpy(p, value.data(), value.size());
+ assert((unsigned)((p + value.size()) - entry) ==
+ (unsigned)(VarintLength(key_length) + key_length +
+ VarintLength(value.size()) + value.size()));
+ RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+ return;
+ }
+ }
+ }
+ }
+
+ // key doesn't exist
+ bool add_res __attribute__((__unused__));
+ add_res = Add(seq, kTypeValue, key, value);
+  // We already asserted existing_seq != seq above, so Add should not fail.
+ assert(add_res);
+}
+
+bool MemTable::UpdateCallback(SequenceNumber seq,
+ const Slice& key,
+ const Slice& delta) {
+ LookupKey lkey(key, seq);
+ Slice memkey = lkey.memtable_key();
+
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(lkey.internal_key(), memkey.data());
+
+ if (iter->Valid()) {
+ // entry format is:
+ // key_length varint32
+ // userkey char[klength-8]
+ // tag uint64
+ // vlength varint32
+ // value char[vlength]
+    // Check that it belongs to the same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Equal(
+ Slice(key_ptr, key_length - 8), lkey.user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ uint64_t unused;
+ UnPackSequenceAndType(tag, &unused, &type);
+ switch (type) {
+ case kTypeValue: {
+ Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+
+ char* prev_buffer = const_cast<char*>(prev_value.data());
+ uint32_t new_prev_size = prev_size;
+
+ std::string str_value;
+ WriteLock wl(GetLock(lkey.user_key()));
+ auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
+ delta, &str_value);
+ if (status == UpdateStatus::UPDATED_INPLACE) {
+ // Value already updated by callback.
+ assert(new_prev_size <= prev_size);
+ if (new_prev_size < prev_size) {
+ // overwrite the new prev_size
+ char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+ new_prev_size);
+ if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+ // shift the value buffer as well.
+ memcpy(p, prev_buffer, new_prev_size);
+ }
+ }
+ RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+ UpdateFlushState();
+ return true;
+ } else if (status == UpdateStatus::UPDATED) {
+ Add(seq, kTypeValue, key, Slice(str_value));
+ RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
+ UpdateFlushState();
+ return true;
+ } else if (status == UpdateStatus::UPDATE_FAILED) {
+ // No action required. Return.
+ UpdateFlushState();
+ return true;
+ }
+ }
+ default:
+ break;
+ }
+ }
+ }
+ // If the latest value is not kTypeValue
+ // or key doesn't exist
+ return false;
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+ Slice memkey = key.memtable_key();
+
+  // A totally ordered iterator is costly for some memtable reps (prefix-aware
+ // reps). By passing in the user key, we allow efficient iterator creation.
+ // The iterator only needs to be ordered within the same user key.
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(key.internal_key(), memkey.data());
+
+ size_t num_successive_merges = 0;
+
+ for (; iter->Valid(); iter->Next()) {
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (!comparator_.comparator.user_comparator()->Equal(
+ Slice(iter_key_ptr, key_length - 8), key.user_key())) {
+ break;
+ }
+
+ const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+ ValueType type;
+ uint64_t unused;
+ UnPackSequenceAndType(tag, &unused, &type);
+ if (type != kTypeMerge) {
+ break;
+ }
+
+ ++num_successive_merges;
+ }
+
+ return num_successive_merges;
+}
+
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry)) {
+ auto iter = GetDynamicPrefixIterator();
+ for (iter->Seek(k.internal_key(), k.memtable_key().data());
+ iter->Valid() && callback_func(callback_args, iter->key());
+ iter->Next()) {
+ }
+}
+
+void MemTable::RefLogContainingPrepSection(uint64_t log) {
+ assert(log > 0);
+ auto cur = min_prep_log_referenced_.load();
+ while ((log < cur || cur == 0) &&
+ !min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
+ cur = min_prep_log_referenced_.load();
+ }
+}
+
+uint64_t MemTable::GetMinLogContainingPrepSection() {
+ return min_prep_log_referenced_.load();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
new file mode 100644
index 000000000..f4e4b98a9
--- /dev/null
+++ b/src/rocksdb/db/memtable.h
@@ -0,0 +1,542 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/version_edit.h"
+#include "memory/allocator.h"
+#include "memory/concurrent_arena.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "table/multiget_context.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FlushJobInfo;
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+
+struct ImmutableMemTableOptions {
+ explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+ size_t arena_block_size;
+ uint32_t memtable_prefix_bloom_bits;
+ size_t memtable_huge_page_size;
+ bool memtable_whole_key_filtering;
+ bool inplace_update_support;
+ size_t inplace_update_num_locks;
+ UpdateStatus (*inplace_callback)(char* existing_value,
+ uint32_t* existing_value_size,
+ Slice delta_value,
+ std::string* merged_value);
+ size_t max_successive_merges;
+ Statistics* statistics;
+ MergeOperator* merge_operator;
+ Logger* info_log;
+};
+
+// Batched counters to be updated when inserting keys in one write batch.
+// In post process of the write batch, these can be updated together.
+// Only used in concurrent memtable insert case.
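+// Typical concurrent-insert flow (sketch): each writer thread passes its own
+// MemTablePostProcessInfo to Add(..., allow_concurrent=true, &info, ...) and
+// calls MemTable::BatchPostProcess(info) once after the whole batch has been
+// inserted.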
+struct MemTablePostProcessInfo {
+ uint64_t data_size = 0;
+ uint64_t num_entries = 0;
+ uint64_t num_deletes = 0;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// Note: Many of the methods in this class have comments indicating that
+// external synchronization is required as these methods are not thread-safe.
+// It is up to higher layers of code to decide how to prevent concurrent
+// invocation of these methods. This is usually done by acquiring either
+// the db mutex or the single writer thread.
+//
+// Some of these methods are documented to only require external
+// synchronization if this memtable is immutable. Calling MarkImmutable() is
+// not sufficient to guarantee immutability. It is up to higher layers of
+// code to determine if this MemTable can still be modified by other threads.
+// Eg: The Superversion stores a pointer to the current MemTable (that can
+// be modified) and a separate list of the MemTables that can no longer be
+// written to (aka the 'immutable memtables').
+class MemTable {
+ public:
+ struct KeyComparator : public MemTableRep::KeyComparator {
+ const InternalKeyComparator comparator;
+ explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+ virtual int operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const override;
+ virtual int operator()(const char* prefix_len_key,
+ const DecodedType& key) const override;
+ };
+
+ // MemTables are reference counted. The initial reference count
+ // is zero and the caller must call Ref() at least once.
+ //
+ // earliest_seq should be the current SequenceNumber in the db such that any
+ // key inserted into this memtable will have an equal or larger seq number.
+ // (When a db is first created, the earliest sequence number will be 0).
+ // If the earliest sequence number is not known, kMaxSequenceNumber may be
+ // used, but this may prevent some transactions from succeeding until the
+ // first key is inserted into the memtable.
+ explicit MemTable(const InternalKeyComparator& comparator,
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber earliest_seq, uint32_t column_family_id);
+ // No copying allowed
+ MemTable(const MemTable&) = delete;
+ MemTable& operator=(const MemTable&) = delete;
+
+  // Do not delete this MemTable unless Unref() indicates it is not in use.
+ ~MemTable();
+
+ // Increase reference count.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void Ref() { ++refs_; }
+
+ // Drop reference count.
+ // If the refcount goes to zero return this memtable, otherwise return null.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ MemTable* Unref() {
+ --refs_;
+ assert(refs_ >= 0);
+ if (refs_ <= 0) {
+ return this;
+ }
+ return nullptr;
+ }
+
+ // Returns an estimate of the number of bytes of data in use by this
+ // data structure.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ size_t ApproximateMemoryUsage();
+
+  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+  // require external synchronization. The value may be less accurate though.
+ size_t ApproximateMemoryUsageFast() const {
+ return approximate_memory_usage_.load(std::memory_order_relaxed);
+ }
+
+ // This method heuristically determines if the memtable should continue to
+ // host more data.
+ bool ShouldScheduleFlush() const {
+ return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
+ }
+
+ // Returns true if a flush should be scheduled and the caller should
+ // be the one to schedule it
+ bool MarkFlushScheduled() {
+ auto before = FLUSH_REQUESTED;
+ return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+
+ // Return an iterator that yields the contents of the memtable.
+ //
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live. The keys returned by this
+ // iterator are internal keys encoded by AppendInternalKey in the
+ // db/dbformat.{h,cc} module.
+ //
+ // By default, it returns an iterator for prefix seek if prefix_extractor
+ // is configured in Options.
+ // arena: If not null, the arena needs to be used to allocate the Iterator.
+ // Calling ~Iterator of the iterator will destroy all the states but
+ // those allocated in arena.
+ InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
+
+ FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq);
+
+ // Add an entry into memtable that maps key to value at the
+ // specified sequence number and with the specified type.
+ // Typically value will be empty if type==kTypeDeletion.
+ //
+ // REQUIRES: if allow_concurrent = false, external synchronization to prevent
+ // simultaneous operations on the same MemTable.
+ //
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
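+  //
+  // Example (sketch): a single-threaded writer inserting a Put would call
+  //   mem->Add(seq, kTypeValue, user_key, value);
+  // while concurrent writers pass allow_concurrent = true together with a
+  // per-thread MemTablePostProcessInfo.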
+ bool Add(SequenceNumber seq, ValueType type, const Slice& key,
+ const Slice& value, bool allow_concurrent = false,
+ MemTablePostProcessInfo* post_process_info = nullptr,
+ void** hint = nullptr);
+
+ // Used to Get value associated with key or Get Merge Operands associated
+ // with key.
+  // If do_merge = true, the default behavior (get the value for key) is
+  // executed; the expected behavior is described right below.
+ // If memtable contains a value for key, store it in *value and return true.
+ // If memtable contains a deletion for key, store a NotFound() error
+ // in *status and return true.
+ // If memtable contains Merge operation as the most recent entry for a key,
+ // and the merge process does not stop (not reaching a value or delete),
+  // prepend the current merge operand to *operands,
+  // store MergeInProgress in *s, and return false.
+ // Else, return false.
+ // If any operation was found, its most recent sequence number
+ // will be stored in *seq on success (regardless of whether true/false is
+ // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
+ // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
+ // status returned indicates a corruption or other unexpected error.
+ // If do_merge = false then any Merge Operands encountered for key are simply
+ // stored in merge_context.operands_list and never actually merged to get a
+ // final value. The raw Merge Operands are eventually returned to the user.
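+  //
+  // Usage sketch (the caller-side names here are illustrative only):
+  //   std::string value;
+  //   Status s;
+  //   MergeContext merge_context;
+  //   SequenceNumber max_covering_tombstone_seq = 0;
+  //   bool found = mem->Get(LookupKey(user_key, snapshot_seq), &value, &s,
+  //                         &merge_context, &max_covering_tombstone_seq,
+  //                         ReadOptions());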
+ bool Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr, bool do_merge = true);
+
+ bool Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr, bool do_merge = true) {
+ SequenceNumber seq;
+ return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq,
+ read_opts, callback, is_blob_index, do_merge);
+ }
+
+ void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool* is_blob);
+
+ // Attempts to update the new_value inplace, else does normal Add
+ // Pseudocode
+ // if key exists in current memtable && prev_value is of type kTypeValue
+  //  if sizeof(new_value) <= sizeof(prev_value)
+ // update inplace
+ // else add(key, new_value)
+ // else add(key, new_value)
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void Update(SequenceNumber seq,
+ const Slice& key,
+ const Slice& value);
+
+  // If prev_value for key exists, attempts to update it inplace;
+  // otherwise returns false.
+ // Pseudocode
+ // if key exists in current memtable && prev_value is of type kTypeValue
+ // new_value = delta(prev_value)
+ // if sizeof(new_value) <= sizeof(prev_value)
+ // update inplace
+ // else add(key, new_value)
+ // else return false
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
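+  //
+  // delta is interpreted by moptions_.inplace_callback, which may rewrite the
+  // existing value in place (UpdateStatus::UPDATED_INPLACE), produce a new
+  // value that is added as a fresh entry (UpdateStatus::UPDATED), or report
+  // UpdateStatus::UPDATE_FAILED; in all three cases this method returns true.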
+ bool UpdateCallback(SequenceNumber seq,
+ const Slice& key,
+ const Slice& delta);
+
+ // Returns the number of successive merge entries starting from the newest
+ // entry for the key up to the last non-merge entry or last entry for the
+ // key in the memtable.
+ size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+ // Update counters and flush status after inserting a whole write batch
+ // Used in concurrent memtable inserts.
+ void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
+ num_entries_.fetch_add(update_counters.num_entries,
+ std::memory_order_relaxed);
+ data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
+ if (update_counters.num_deletes != 0) {
+ num_deletes_.fetch_add(update_counters.num_deletes,
+ std::memory_order_relaxed);
+ }
+ UpdateFlushState();
+ }
+
+ // Get total number of entries in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ uint64_t num_entries() const {
+ return num_entries_.load(std::memory_order_relaxed);
+ }
+
+ // Get total number of deletes in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ uint64_t num_deletes() const {
+ return num_deletes_.load(std::memory_order_relaxed);
+ }
+
+ uint64_t get_data_size() const {
+ return data_size_.load(std::memory_order_relaxed);
+ }
+
+ // Dynamically change the memtable's capacity. If set below the current usage,
+ // the next key added will trigger a flush. Can only increase size when
+ // memtable prefix bloom is disabled, since we can't easily allocate more
+ // space.
+ void UpdateWriteBufferSize(size_t new_write_buffer_size) {
+ if (bloom_filter_ == nullptr ||
+ new_write_buffer_size < write_buffer_size_) {
+ write_buffer_size_.store(new_write_buffer_size,
+ std::memory_order_relaxed);
+ }
+ }
+
+ // Returns the edits area that is needed for flushing the memtable
+ VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns true if no entry has been inserted into the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ bool IsEmpty() const { return first_seqno_ == 0; }
+
+ // Returns the sequence number of the first element that was inserted
+ // into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ SequenceNumber GetFirstSequenceNumber() {
+ return first_seqno_.load(std::memory_order_relaxed);
+ }
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+ // memtable. It can then be assumed that any write with a larger(or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ SequenceNumber GetEarliestSequenceNumber() {
+ return earliest_seqno_.load(std::memory_order_relaxed);
+ }
+
+ // DB's latest sequence ID when the memtable is created. This number
+ // may be updated to a more recent one before any key is inserted.
+ SequenceNumber GetCreationSeq() const { return creation_seq_; }
+
+ void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }
+
+ // Returns the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+ // Sets the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+ // if this memtable contains data from a committed
+ // two phase transaction we must take note of the
+ // log which contains that data so we can know
+  // when to release that log
+ void RefLogContainingPrepSection(uint64_t log);
+ uint64_t GetMinLogContainingPrepSection();
+
+ // Notify the underlying storage that no more items will be added.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ // After MarkImmutable() is called, you should not attempt to
+  // write anything to this MemTable (i.e. do not call Add() or Update()).
+ void MarkImmutable() {
+ table_->MarkReadOnly();
+ mem_tracker_.DoneAllocating();
+ }
+
+ // Notify the underlying storage that all data it contained has been
+ // persisted.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void MarkFlushed() {
+ table_->MarkFlushed();
+ }
+
+ // return true if the current MemTableRep supports merge operator.
+ bool IsMergeOperatorSupported() const {
+ return table_->IsMergeOperatorSupported();
+ }
+
+ // return true if the current MemTableRep supports snapshots.
+  // inplace update prevents snapshots.
+ bool IsSnapshotSupported() const {
+ return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+ }
+
+ struct MemTableStats {
+ uint64_t size;
+ uint64_t count;
+ };
+
+ MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey);
+
+ // Get the lock associated for the key
+ port::RWMutex* GetLock(const Slice& key);
+
+ const InternalKeyComparator& GetInternalKeyComparator() const {
+ return comparator_.comparator;
+ }
+
+ const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+ return &moptions_;
+ }
+
+ uint64_t ApproximateOldestKeyTime() const {
+ return oldest_key_time_.load(std::memory_order_relaxed);
+ }
+
+ // REQUIRES: db_mutex held.
+ void SetID(uint64_t id) { id_ = id; }
+
+ uint64_t GetID() const { return id_; }
+
+ void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
+
+ uint64_t GetFileNumber() const { return file_number_; }
+
+ void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
+
+ void SetFlushInProgress(bool in_progress) {
+ flush_in_progress_ = in_progress;
+ }
+
+#ifndef ROCKSDB_LITE
+ void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
+ flush_job_info_ = std::move(info);
+ }
+
+ std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
+ return std::move(flush_job_info_);
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
+
+ friend class MemTableIterator;
+ friend class MemTableBackwardIterator;
+ friend class MemTableList;
+
+ KeyComparator comparator_;
+ const ImmutableMemTableOptions moptions_;
+ int refs_;
+ const size_t kArenaBlockSize;
+ AllocTracker mem_tracker_;
+ ConcurrentArena arena_;
+ std::unique_ptr<MemTableRep> table_;
+ std::unique_ptr<MemTableRep> range_del_table_;
+ std::atomic_bool is_range_del_table_empty_;
+
+ // Total data size of all data inserted
+ std::atomic<uint64_t> data_size_;
+ std::atomic<uint64_t> num_entries_;
+ std::atomic<uint64_t> num_deletes_;
+
+ // Dynamically changeable memtable option
+ std::atomic<size_t> write_buffer_size_;
+
+ // These are used to manage memtable flushes to storage
+ bool flush_in_progress_; // started the flush
+ bool flush_completed_; // finished the flush
+ uint64_t file_number_; // filled up after flush is complete
+
+ // The updates to be applied to the transaction log when this
+ // memtable is flushed to storage.
+ VersionEdit edit_;
+
+ // The sequence number of the kv that was inserted first
+ std::atomic<SequenceNumber> first_seqno_;
+
+ // The db sequence number at the time of creation or kMaxSequenceNumber
+ // if not set.
+ std::atomic<SequenceNumber> earliest_seqno_;
+
+ SequenceNumber creation_seq_;
+
+ // The log files earlier than this number can be deleted.
+ uint64_t mem_next_logfile_number_;
+
+ // the earliest log containing a prepared section
+ // which has been inserted into this memtable.
+ std::atomic<uint64_t> min_prep_log_referenced_;
+
+ // rw locks for inplace updates
+ std::vector<port::RWMutex> locks_;
+
+ const SliceTransform* const prefix_extractor_;
+ std::unique_ptr<DynamicBloom> bloom_filter_;
+
+ std::atomic<FlushStateEnum> flush_state_;
+
+ Env* env_;
+
+ // Extract sequential insert prefixes.
+ const SliceTransform* insert_with_hint_prefix_extractor_;
+
+ // Insert hints for each prefix.
+ std::unordered_map<Slice, void*, SliceHasher> insert_hints_;
+
+ // Timestamp of oldest key
+ std::atomic<uint64_t> oldest_key_time_;
+
+ // Memtable id to track flush.
+ uint64_t id_ = 0;
+
+ // Sequence number of the atomic flush that is responsible for this memtable.
+ // The sequence number of atomic flush is a seq, such that no writes with
+ // sequence numbers greater than or equal to seq are flushed, while all
+ // writes with sequence number smaller than seq are flushed.
+ SequenceNumber atomic_flush_seqno_;
+
+ // keep track of memory usage in table_, arena_, and range_del_table_.
+  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
+ std::atomic<uint64_t> approximate_memory_usage_;
+
+#ifndef ROCKSDB_LITE
+ // Flush job info of the current memtable.
+ std::unique_ptr<FlushJobInfo> flush_job_info_;
+#endif // !ROCKSDB_LITE
+
+ // Returns a heuristic flush decision
+ bool ShouldFlushNow();
+
+ // Updates flush_state_ using ShouldFlushNow()
+ void UpdateFlushState();
+
+ void UpdateOldestKeyTime();
+
+ void GetFromTable(const LookupKey& key,
+ SequenceNumber max_covering_tombstone_seq, bool do_merge,
+ ReadCallback* callback, bool* is_blob_index,
+ std::string* value, Status* s, MergeContext* merge_context,
+ SequenceNumber* seq, bool* found_final_value,
+ bool* merge_in_progress);
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
new file mode 100644
index 000000000..a8b358fa6
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.cc
@@ -0,0 +1,771 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/memtable_list.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/memtable.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "logging/log_buffer.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+void MemTableListVersion::AddMemTable(MemTable* m) {
+ memlist_.push_front(m);
+ *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
+}
+
+void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
+ MemTable* m) {
+ if (m->Unref()) {
+ to_delete->push_back(m);
+ assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
+ *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
+ }
+}
+
+MemTableListVersion::MemTableListVersion(
+ size_t* parent_memtable_list_memory_usage, MemTableListVersion* old)
+ : max_write_buffer_number_to_maintain_(
+ old->max_write_buffer_number_to_maintain_),
+ max_write_buffer_size_to_maintain_(
+ old->max_write_buffer_size_to_maintain_),
+ parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
+ if (old != nullptr) {
+ memlist_ = old->memlist_;
+ for (auto& m : memlist_) {
+ m->Ref();
+ }
+
+ memlist_history_ = old->memlist_history_;
+ for (auto& m : memlist_history_) {
+ m->Ref();
+ }
+ }
+}
+
+MemTableListVersion::MemTableListVersion(
+ size_t* parent_memtable_list_memory_usage,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain)
+ : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
+ max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
+ parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
+
+void MemTableListVersion::Ref() { ++refs_; }
+
+// called by superversion::clean()
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ // if to_delete is equal to nullptr it means we're confident
+ // that refs_ will not be zero
+ assert(to_delete != nullptr);
+ for (const auto& m : memlist_) {
+ UnrefMemTable(to_delete, m);
+ }
+ for (const auto& m : memlist_history_) {
+ UnrefMemTable(to_delete, m);
+ }
+ delete this;
+ }
+}
+
+int MemTableList::NumNotFlushed() const {
+ int size = static_cast<int>(current_->memlist_.size());
+ assert(num_flush_not_started_ <= size);
+ return size;
+}
+
+int MemTableList::NumFlushed() const {
+ return static_cast<int>(current_->memlist_history_.size());
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback, bool* is_blob_index) {
+ return GetFromList(&memlist_, key, value, s, merge_context,
+ max_covering_tombstone_seq, seq, read_opts, callback,
+ is_blob_index);
+}
+
+void MemTableListVersion::MultiGet(const ReadOptions& read_options,
+ MultiGetRange* range, ReadCallback* callback,
+ bool* is_blob) {
+ for (auto memtable : memlist_) {
+ memtable->MultiGet(read_options, range, callback, is_blob);
+ if (range->empty()) {
+ return;
+ }
+ }
+}
+
+bool MemTableListVersion::GetMergeOperands(
+ const LookupKey& key, Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) {
+ for (MemTable* memtable : memlist_) {
+ bool done = memtable->Get(key, nullptr, s, merge_context,
+ max_covering_tombstone_seq, read_opts, nullptr,
+ nullptr, false);
+ if (done) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool MemTableListVersion::GetFromHistory(
+ const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) {
+ return GetFromList(&memlist_history_, key, value, s, merge_context,
+ max_covering_tombstone_seq, seq, read_opts,
+ nullptr /*read_callback*/, is_blob_index);
+}
+
+bool MemTableListVersion::GetFromList(
+ std::list<MemTable*>* list, const LookupKey& key, std::string* value,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, ReadCallback* callback, bool* is_blob_index) {
+ *seq = kMaxSequenceNumber;
+
+ for (auto& memtable : *list) {
+ SequenceNumber current_seq = kMaxSequenceNumber;
+
+ bool done =
+ memtable->Get(key, value, s, merge_context, max_covering_tombstone_seq,
+ &current_seq, read_opts, callback, is_blob_index);
+ if (*seq == kMaxSequenceNumber) {
+ // Store the most recent sequence number of any operation on this key.
+ // Since we only care about the most recent change, we only need to
+ // return the first operation found when searching memtables in
+ // reverse-chronological order.
+ // current_seq would be equal to kMaxSequenceNumber if the value was to be
+ // skipped. This allows seq to be assigned again when the next value is
+ // read.
+ *seq = current_seq;
+ }
+
+ if (done) {
+ assert(*seq != kMaxSequenceNumber || s->IsNotFound());
+ return true;
+ }
+ if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
+ return false;
+ }
+ }
+ return false;
+}
+
+Status MemTableListVersion::AddRangeTombstoneIterators(
+ const ReadOptions& read_opts, Arena* /*arena*/,
+ RangeDelAggregator* range_del_agg) {
+ assert(range_del_agg != nullptr);
+ // Except for snapshot read, using kMaxSequenceNumber is OK because these
+ // are immutable memtables.
+ SequenceNumber read_seq = read_opts.snapshot != nullptr
+ ? read_opts.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber;
+ for (auto& m : memlist_) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ m->NewRangeTombstoneIterator(read_opts, read_seq));
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+ return Status::OK();
+}
+
+void MemTableListVersion::AddIterators(
+ const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
+ Arena* arena) {
+ for (auto& m : memlist_) {
+ iterator_list->push_back(m->NewIterator(options, arena));
+ }
+}
+
+void MemTableListVersion::AddIterators(
+ const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
+ for (auto& m : memlist_) {
+ merge_iter_builder->AddIterator(
+ m->NewIterator(options, merge_iter_builder->GetArena()));
+ }
+}
+
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+ uint64_t total_num = 0;
+ for (auto& m : memlist_) {
+ total_num += m->num_entries();
+ }
+ return total_num;
+}
+
+MemTable::MemTableStats MemTableListVersion::ApproximateStats(
+ const Slice& start_ikey, const Slice& end_ikey) {
+ MemTable::MemTableStats total_stats = {0, 0};
+ for (auto& m : memlist_) {
+ auto mStats = m->ApproximateStats(start_ikey, end_ikey);
+ total_stats.size += mStats.size;
+ total_stats.count += mStats.count;
+ }
+ return total_stats;
+}
+
+uint64_t MemTableListVersion::GetTotalNumDeletes() const {
+ uint64_t total_num = 0;
+ for (auto& m : memlist_) {
+ total_num += m->num_deletes();
+ }
+ return total_num;
+}
+
+SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
+ bool include_history) const {
+ if (include_history && !memlist_history_.empty()) {
+ return memlist_history_.back()->GetEarliestSequenceNumber();
+ } else if (!memlist_.empty()) {
+ return memlist_.back()->GetEarliestSequenceNumber();
+ } else {
+ return kMaxSequenceNumber;
+ }
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
+ AddMemTable(m);
+
+ TrimHistory(to_delete, m->ApproximateMemoryUsage());
+}
+
+// Removes m from list of memtables not flushed. Caller should NOT Unref m.
+void MemTableListVersion::Remove(MemTable* m,
+ autovector<MemTable*>* to_delete) {
+ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
+ memlist_.remove(m);
+
+ m->MarkFlushed();
+ if (max_write_buffer_size_to_maintain_ > 0 ||
+ max_write_buffer_number_to_maintain_ > 0) {
+ memlist_history_.push_front(m);
+ // Unable to get size of mutable memtable at this point, pass 0 to
+ // TrimHistory as a best effort.
+ TrimHistory(to_delete, 0);
+ } else {
+ UnrefMemTable(to_delete, m);
+ }
+}
+
+// return the total memory usage assuming the oldest flushed memtable is dropped
+size_t MemTableListVersion::ApproximateMemoryUsageExcludingLast() const {
+ size_t total_memtable_size = 0;
+ for (auto& memtable : memlist_) {
+ total_memtable_size += memtable->ApproximateMemoryUsage();
+ }
+ for (auto& memtable : memlist_history_) {
+ total_memtable_size += memtable->ApproximateMemoryUsage();
+ }
+ if (!memlist_history_.empty()) {
+ total_memtable_size -= memlist_history_.back()->ApproximateMemoryUsage();
+ }
+ return total_memtable_size;
+}
+
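+// For example, with max_write_buffer_size_to_maintain_ = 64 MB, usage = 16 MB
+// and ApproximateMemoryUsageExcludingLast() = 56 MB, 56 + 16 >= 64 holds, so
+// TrimHistory() keeps dropping the oldest memtables from memlist_history_.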
+bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
+ if (max_write_buffer_size_to_maintain_ > 0) {
+ // calculate the total memory usage after dropping the oldest flushed
+ // memtable, compare with max_write_buffer_size_to_maintain_ to decide
+ // whether to trim history
+ return ApproximateMemoryUsageExcludingLast() + usage >=
+ static_cast<size_t>(max_write_buffer_size_to_maintain_);
+ } else if (max_write_buffer_number_to_maintain_ > 0) {
+ return memlist_.size() + memlist_history_.size() >
+ static_cast<size_t>(max_write_buffer_number_to_maintain_);
+ } else {
+ return false;
+ }
+}
+
+// Make sure we don't use up too much space in history
+void MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete,
+ size_t usage) {
+ while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) {
+ MemTable* x = memlist_history_.back();
+ memlist_history_.pop_back();
+
+ UnrefMemTable(to_delete, x);
+ }
+}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+ if ((flush_requested_ && num_flush_not_started_ > 0) ||
+ (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+ assert(imm_flush_needed.load(std::memory_order_relaxed));
+ return true;
+ }
+ return false;
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id,
+ autovector<MemTable*>* ret) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
+ const auto& memlist = current_->memlist_;
+ bool atomic_flush = false;
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* m = *it;
+ if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
+ atomic_flush = true;
+ }
+ if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) {
+ break;
+ }
+ if (!m->flush_in_progress_) {
+ assert(!m->flush_completed_);
+ num_flush_not_started_--;
+ if (num_flush_not_started_ == 0) {
+ imm_flush_needed.store(false, std::memory_order_release);
+ }
+ m->flush_in_progress_ = true; // flushing will start very soon
+ ret->push_back(m);
+ }
+ }
+ if (!atomic_flush || num_flush_not_started_ == 0) {
+ flush_requested_ = false; // start-flush request is complete
+ }
+}
+
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+ uint64_t /*file_number*/) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
+ assert(!mems.empty());
+
+ // If the flush was not successful, then just reset state.
+ // Maybe a succeeding attempt to flush will be successful.
+ for (MemTable* m : mems) {
+ assert(m->flush_in_progress_);
+ assert(m->file_number_ == 0);
+
+ m->flush_in_progress_ = false;
+ m->flush_completed_ = false;
+ m->edit_.Clear();
+ num_flush_not_started_++;
+ }
+ imm_flush_needed.store(true, std::memory_order_release);
+}
+
+// Try to record a successful flush in the manifest file. It might just return
+// Status::OK, letting a concurrent flush do the actual recording.
+Status MemTableList::TryInstallMemtableFlushResults(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
+ VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer,
+ std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+ mu->AssertHeld();
+
+ // Flush was successful
+ // Record the status on the memtable object. Either this call or a call by a
+ // concurrent flush thread will read the status and write it to manifest.
+ for (size_t i = 0; i < mems.size(); ++i) {
+ // All the edits are associated with the first memtable of this batch.
+ assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+ mems[i]->flush_completed_ = true;
+ mems[i]->file_number_ = file_number;
+ }
+
+ // if some other thread is already committing, then return
+ Status s;
+ if (commit_in_progress_) {
+ TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
+ return s;
+ }
+
+ // Only a single thread can be executing this piece of code
+ commit_in_progress_ = true;
+
+  // Retry until all completed flushes are committed. New flushes can finish
+  // while the current thread is writing the manifest, during which the mutex
+  // is released.
+ while (s.ok()) {
+ auto& memlist = current_->memlist_;
+    // The back is the oldest; if flush_completed_ is not set on it, it means
+    // that we were assigned a more recent memtable. The memtables' flushes must
+    // be recorded in the manifest in order. The concurrent flush thread that is
+    // assigned to flush the oldest memtable will later wake up and do all
+    // the pending writes to the manifest, in order.
+ if (memlist.empty() || !memlist.back()->flush_completed_) {
+ break;
+ }
+ // scan all memtables from the earliest, and commit those
+ // (in that order) that have finished flushing. Memtables
+ // are always committed in the order that they were created.
+ uint64_t batch_file_number = 0;
+ size_t batch_count = 0;
+ autovector<VersionEdit*> edit_list;
+ autovector<MemTable*> memtables_to_flush;
+    // enumerate from the last (earliest) element to see how many batches have finished
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* m = *it;
+ if (!m->flush_completed_) {
+ break;
+ }
+ if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
+ batch_file_number = m->file_number_;
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64 " started",
+ cfd->GetName().c_str(), m->file_number_);
+ edit_list.push_back(&m->edit_);
+ memtables_to_flush.push_back(m);
+#ifndef ROCKSDB_LITE
+ std::unique_ptr<FlushJobInfo> info = m->ReleaseFlushJobInfo();
+ if (info != nullptr) {
+ committed_flush_jobs_info->push_back(std::move(info));
+ }
+#else
+ (void)committed_flush_jobs_info;
+#endif // !ROCKSDB_LITE
+ }
+ batch_count++;
+ }
+
+ // TODO(myabandeh): Not sure how batch_count could be 0 here.
+ if (batch_count > 0) {
+ if (vset->db_options()->allow_2pc) {
+ assert(edit_list.size() > 0);
+ // We piggyback the information about the earliest log file to keep in the
+ // manifest entry for the last file flushed.
+ edit_list.back()->SetMinLogNumberToKeep(PrecomputeMinLogNumberToKeep(
+ vset, *cfd, edit_list, memtables_to_flush, prep_tracker));
+ }
+
+ // this can release and reacquire the mutex.
+ s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu,
+ db_directory);
+
+ // We will be changing the version in the next code path, so we should
+ // create a new one, since versions are immutable.
+ InstallNewVersion();
+
+ // All the later memtables that have the same filenum
+ // are part of the same batch. They can be committed now.
+ uint64_t mem_id = 1; // how many memtables have been flushed.
+
+ // commit new state only if the column family is NOT dropped.
+ // The reason is as follows (refer to
+ // ColumnFamilyTest.FlushAndDropRaceCondition).
+ // If the column family is dropped, then according to LogAndApply, its
+ // corresponding flush operation is NOT written to the MANIFEST. This
+ // means the DB is not aware of the L0 files generated from the flush.
+ // By committing the new state, we would remove the memtable from the
+ // memtable list. An iterator created on this column family would then not
+ // be able to read full data, since the memtable is removed and the DB is
+ // not aware of the L0 files, leaving MergingIterator unable to build child
+ // iterators. The RocksDB contract requires that an iterator can be created
+ // on a dropped column family, and that we must be able to read full data
+ // as long as the column family handle is not deleted, even if the column
+ // family is dropped.
+ if (s.ok() && !cfd->IsDropped()) { // commit new state
+ while (batch_count-- > 0) {
+ MemTable* m = current_->memlist_.back();
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " done",
+ cfd->GetName().c_str(), m->file_number_, mem_id);
+ assert(m->file_number_ > 0);
+ current_->Remove(m, to_delete);
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+ ++mem_id;
+ }
+ } else {
+ for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) {
+ MemTable* m = *it;
+ // commit failed. setup state so that we can flush again.
+ ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " failed",
+ m->file_number_, mem_id);
+ m->flush_completed_ = false;
+ m->flush_in_progress_ = false;
+ m->edit_.Clear();
+ num_flush_not_started_++;
+ m->file_number_ = 0;
+ imm_flush_needed.store(true, std::memory_order_release);
+ ++mem_id;
+ }
+ }
+ }
+ }
+ commit_in_progress_ = false;
+ return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+ assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
+ InstallNewVersion();
+ // This method is used to move a mutable memtable into the immutable list.
+ // Since the mutable memtable is already refcounted by the DBImpl, and we do
+ // not unref it when moving it to the immutable list, we do not have to ref
+ // the memtable here; we just take over the reference from the DBImpl.
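+ // A typical caller pattern (mirroring memtable_list_test.cc): the caller
+ // already did m->Ref() while the memtable was mutable and simply hands that
+ // reference over to this list via Add().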
+ current_->Add(m, to_delete);
+ m->MarkImmutable();
+ num_flush_not_started_++;
+ if (num_flush_not_started_ == 1) {
+ imm_flush_needed.store(true, std::memory_order_release);
+ }
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
+
+void MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) {
+ InstallNewVersion();
+ current_->TrimHistory(to_delete, usage);
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
+ size_t total_size = 0;
+ for (auto& memtable : current_->memlist_) {
+ total_size += memtable->ApproximateMemoryUsage();
+ }
+ return total_size;
+}
+
+size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
+
+size_t MemTableList::ApproximateMemoryUsageExcludingLast() const {
+ const size_t usage =
+ current_memory_usage_excluding_last_.load(std::memory_order_relaxed);
+ return usage;
+}
+
+bool MemTableList::HasHistory() const {
+ const bool has_history = current_has_history_.load(std::memory_order_relaxed);
+ return has_history;
+}
+
+void MemTableList::UpdateCachedValuesFromMemTableListVersion() {
+ const size_t total_memtable_size =
+ current_->ApproximateMemoryUsageExcludingLast();
+ current_memory_usage_excluding_last_.store(total_memtable_size,
+ std::memory_order_relaxed);
+
+ const bool has_history = current_->HasHistory();
+ current_has_history_.store(has_history, std::memory_order_relaxed);
+}
+
+uint64_t MemTableList::ApproximateOldestKeyTime() const {
+ if (!current_->memlist_.empty()) {
+ return current_->memlist_.back()->ApproximateOldestKeyTime();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+void MemTableList::InstallNewVersion() {
+ if (current_->refs_ == 1) {
+ // we're the only one using the version, just keep using it
+ } else {
+ // somebody else holds the current version, we need to create new one
+ MemTableListVersion* version = current_;
+ current_ = new MemTableListVersion(&current_memory_usage_, current_);
+ current_->Ref();
+ version->Unref();
+ }
+}
+
+uint64_t MemTableList::PrecomputeMinLogContainingPrepSection(
+ const autovector<MemTable*>& memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ for (auto& m : current_->memlist_) {
+ // Assume the list is very short, so we can live with O(m*n). We can
+ // optimize if performance becomes a problem.
+ bool should_skip = false;
+ for (MemTable* m_to_flush : memtables_to_flush) {
+ if (m == m_to_flush) {
+ should_skip = true;
+ break;
+ }
+ }
+ if (should_skip) {
+ continue;
+ }
+
+ auto log = m->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
+// Commit a successful atomic flush in the manifest file.
+Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+ InstrumentedMutex* mu, const autovector<FileMetaData*>& file_metas,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+ mu->AssertHeld();
+
+ size_t num = mems_list.size();
+ assert(cfds.size() == num);
+ if (imm_lists != nullptr) {
+ assert(imm_lists->size() == num);
+ }
+ for (size_t k = 0; k != num; ++k) {
+#ifndef NDEBUG
+ const auto* imm =
+ (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+ if (!mems_list[k]->empty()) {
+ assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID());
+ }
+#endif
+ assert(nullptr != file_metas[k]);
+ for (size_t i = 0; i != mems_list[k]->size(); ++i) {
+ assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
+ (*mems_list[k])[i]->SetFlushCompleted(true);
+ (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber());
+ }
+ }
+
+ Status s;
+
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (const auto mems : mems_list) {
+ assert(mems != nullptr);
+ autovector<VersionEdit*> edits;
+ assert(!mems->empty());
+ edits.emplace_back((*mems)[0]->GetEdits());
+ ++num_entries;
+ edit_lists.emplace_back(edits);
+ }
+ // Mark the version edits as an atomic group if the number of version edits
+ // exceeds 1.
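+ // Illustrative example: with three column families, num_entries is 3 at
+ // this point and the loop below tags the edits with the remaining counts
+ // 2, 1 and 0, which is what the assert checks.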
+ if (cfds.size() > 1) {
+ for (auto& edits : edit_lists) {
+ assert(edits.size() == 1);
+ edits[0]->MarkAtomicGroup(--num_entries);
+ }
+ assert(0 == num_entries);
+ }
+
+ // this can release and reacquire the mutex.
+ s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ db_directory);
+
+ for (size_t k = 0; k != cfds.size(); ++k) {
+ auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+ imm->InstallNewVersion();
+ }
+
+ if (s.ok() || s.IsColumnFamilyDropped()) {
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+ for (auto m : *mems_list[i]) {
+ assert(m->GetFileNumber() > 0);
+ uint64_t mem_id = m->GetID();
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " done",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ mem_id);
+ imm->current_->Remove(m, to_delete);
+ imm->UpdateCachedValuesFromMemTableListVersion();
+ imm->ResetTrimHistoryNeeded();
+ }
+ }
+ } else {
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+ for (auto m : *mems_list[i]) {
+ uint64_t mem_id = m->GetID();
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " failed",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ mem_id);
+ m->SetFlushCompleted(false);
+ m->SetFlushInProgress(false);
+ m->GetEdits()->Clear();
+ m->SetFileNumber(0);
+ imm->num_flush_not_started_++;
+ }
+ imm->imm_flush_needed.store(true, std::memory_order_release);
+ }
+ }
+
+ return s;
+}
+
+void MemTableList::RemoveOldMemTables(uint64_t log_number,
+ autovector<MemTable*>* to_delete) {
+ assert(to_delete != nullptr);
+ InstallNewVersion();
+ auto& memlist = current_->memlist_;
+ autovector<MemTable*> old_memtables;
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* mem = *it;
+ if (mem->GetNextLogNumber() > log_number) {
+ break;
+ }
+ old_memtables.push_back(mem);
+ }
+
+ for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) {
+ MemTable* mem = *it;
+ current_->Remove(mem, to_delete);
+ --num_flush_not_started_;
+ if (0 == num_flush_not_started_) {
+ imm_flush_needed.store(false, std::memory_order_release);
+ }
+ }
+
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h
new file mode 100644
index 000000000..a6acf6a32
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.h
@@ -0,0 +1,422 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class InstrumentedMutex;
+class MergeIteratorBuilder;
+class MemTableList;
+
+struct FlushJobInfo;
+
+// Keeps a list of immutable memtables. The list is immutable if the
+// refcount is bigger than one. It is used as state for the Get() and
+// Iterator code paths.
+//
+// This class is not thread-safe. External synchronization is required
+// (such as holding the db mutex or being on the write thread).
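+//
+// A minimal read-path sketch (illustrative only, mirroring
+// memtable_list_test.cc; `list` is a MemTableList and `snapshot_seq` a
+// placeholder sequence number):
+//
+//   std::string value;
+//   Status s;
+//   MergeContext merge_context;
+//   SequenceNumber max_covering_tombstone_seq = 0;
+//   bool found = list.current()->Get(LookupKey("key1", snapshot_seq), &value,
+//                                    &s, &merge_context,
+//                                    &max_covering_tombstone_seq,
+//                                    ReadOptions());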
+class MemTableListVersion {
+ public:
+ explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+ MemTableListVersion* old = nullptr);
+ explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain);
+
+ void Ref();
+ void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+ // Search all the memtables starting from the most recent one.
+ // Return the most recent value found, if any.
+ //
+ // If any operation was found for this key, its most recent sequence number
+ // will be stored in *seq on success (regardless of whether true/false is
+ // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
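+ //
+ // For example, if the key only has a delete marker at sequence number 10,
+ // Get() returns true with *s set to NotFound and *seq set to 10.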
+ bool Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr);
+
+ bool Get(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr) {
+ SequenceNumber seq;
+ return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq,
+ read_opts, callback, is_blob_index);
+ }
+
+ void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool* is_blob);
+
+ // Returns all the merge operands corresponding to the key by searching all
+ // memtables starting from the most recent one.
+ bool GetMergeOperands(const LookupKey& key, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts);
+
+ // Similar to Get(), but searches the Memtable history of memtables that
+ // have already been flushed. Should only be used from in-memory only
+ // queries (such as Transaction validation) as the history may contain
+ // writes that are also present in the SST files.
+ bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ bool* is_blob_index = nullptr);
+ bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts,
+ bool* is_blob_index = nullptr) {
+ SequenceNumber seq;
+ return GetFromHistory(key, value, s, merge_context,
+ max_covering_tombstone_seq, &seq, read_opts,
+ is_blob_index);
+ }
+
+ Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
+ RangeDelAggregator* range_del_agg);
+
+ void AddIterators(const ReadOptions& options,
+ std::vector<InternalIterator*>* iterator_list,
+ Arena* arena);
+
+ void AddIterators(const ReadOptions& options,
+ MergeIteratorBuilder* merge_iter_builder);
+
+ uint64_t GetTotalNumEntries() const;
+
+ uint64_t GetTotalNumDeletes() const;
+
+ MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey);
+
+ // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
+ // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
+
+ private:
+ friend class MemTableList;
+
+ friend Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ VersionSet* vset, InstrumentedMutex* mu,
+ const autovector<FileMetaData*>& file_meta,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer);
+
+ // REQUIRE: m is an immutable memtable
+ void Add(MemTable* m, autovector<MemTable*>* to_delete);
+ // REQUIRE: m is an immutable memtable
+ void Remove(MemTable* m, autovector<MemTable*>* to_delete);
+
+ void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+ bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
+ std::string* value, Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr);
+
+ void AddMemTable(MemTable* m);
+
+ void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
+
+ // Calculate the total amount of memory used by memlist_ and memlist_history_
+ // excluding the last MemTable in memlist_history_. The reason for excluding
+ // the last MemTable is to see if dropping the last MemTable will keep total
+ // memory usage above or equal to max_write_buffer_size_to_maintain_
+ size_t ApproximateMemoryUsageExcludingLast() const;
+
+ // Whether this version contains flushed memtables that are only kept around
+ // for transaction conflict checking.
+ bool HasHistory() const { return !memlist_history_.empty(); }
+
+ bool MemtableLimitExceeded(size_t usage);
+
+ // Immutable MemTables that have not yet been flushed.
+ std::list<MemTable*> memlist_;
+
+ // MemTables that have already been flushed
+ // (used during Transaction validation)
+ std::list<MemTable*> memlist_history_;
+
+ // Maximum number of MemTables to keep in memory (including both flushed
+ // and not-yet-flushed tables).
+ const int max_write_buffer_number_to_maintain_;
+ // Maximum size of MemTables to keep in memory (including both flushed
+ // and not-yet-flushed tables).
+ const int64_t max_write_buffer_size_to_maintain_;
+
+ int refs_ = 0;
+
+ size_t* parent_memtable_list_memory_usage_;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there is more than one immutable memtable, their
+// flushes can occur concurrently. However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+//
+// Other than imm_flush_needed and imm_trim_needed, this class is not
+// thread-safe and requires external synchronization (such as holding the db
+// mutex or being on the write thread.)
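+//
+// A rough lifecycle sketch (illustrative only; memtable_list_test.cc has
+// complete examples, and `imm` is a placeholder for a memtable the caller
+// already Ref()'d):
+//
+//   autovector<MemTable*> to_delete, to_flush;
+//   list.Add(imm, &to_delete);                      // hand over the memtable
+//   list.PickMemtablesToFlush(nullptr, &to_flush);  // a flush job picks work
+//   // ... the flush job writes the picked memtables to an L0 file, then
+//   // either TryInstallMemtableFlushResults() commits the result to the
+//   // MANIFEST or RollbackMemtableFlush() returns them to the pending state.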
+class MemTableList {
+ public:
+ // A list of memtables.
+ explicit MemTableList(int min_write_buffer_number_to_merge,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain)
+ : imm_flush_needed(false),
+ imm_trim_needed(false),
+ min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+ current_(new MemTableListVersion(&current_memory_usage_,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain)),
+ num_flush_not_started_(0),
+ commit_in_progress_(false),
+ flush_requested_(false),
+ current_memory_usage_(0),
+ current_memory_usage_excluding_last_(0),
+ current_has_history_(false) {
+ current_->Ref();
+ }
+
+ // Should not delete MemTableList without making sure MemTableList::current()
+ // is Unref()'d.
+ ~MemTableList() {}
+
+ MemTableListVersion* current() const { return current_; }
+
+ // So that background threads can detect whether there is anything more to
+ // start flushing.
+ std::atomic<bool> imm_flush_needed;
+
+ std::atomic<bool> imm_trim_needed;
+
+ // Returns the total number of memtables in the list that haven't yet
+ // been flushed and logged.
+ int NumNotFlushed() const;
+
+ // Returns total number of memtables in the list that have been
+ // completely flushed and logged.
+ int NumFlushed() const;
+
+ // Returns true if there is at least one memtable on which flush has
+ // not yet started.
+ bool IsFlushPending() const;
+
+ // Returns the earliest memtables that need to be flushed. The returned
+ // memtables are guaranteed to be in ascending order of creation time.
+ void PickMemtablesToFlush(const uint64_t* max_memtable_id,
+ autovector<MemTable*>* mems);
+
+ // Reset the status of the given memtables back to the pending state so
+ // that they can get picked up again in the next round of flush.
+ void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+ uint64_t file_number);
+
+ // Try to commit a successful flush in the manifest file. It might just
+ // return Status::OK, letting a concurrent flush do the actual recording.
+ Status TryInstallMemtableFlushResults(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
+ VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer,
+ std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info);
+
+ // New memtables are inserted at the front of the list.
+ // Takes ownership of the reference held on *m by the caller of Add().
+ void Add(MemTable* m, autovector<MemTable*>* to_delete);
+
+ // Returns an estimate of the number of bytes of data in use.
+ size_t ApproximateMemoryUsage();
+
+ // Returns the cached current_memory_usage_excluding_last_ value.
+ size_t ApproximateMemoryUsageExcludingLast() const;
+
+ // Returns the cached current_has_history_ value.
+ bool HasHistory() const;
+
+ // Updates current_memory_usage_excluding_last_ and current_has_history_
+ // from MemTableListVersion. Must be called whenever InstallNewVersion is
+ // called.
+ void UpdateCachedValuesFromMemTableListVersion();
+
+ // `usage` is the current size of the mutable Memtable. When
+ // max_write_buffer_size_to_maintain is used, the total size of the mutable
+ // and immutable memtables is checked against it to decide whether to trim
+ // the memtable list.
+ void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+ // Returns an estimate of the number of bytes of data used by
+ // the unflushed mem-tables.
+ size_t ApproximateUnflushedMemTablesMemoryUsage();
+
+ // Returns an estimate of the timestamp of the earliest key.
+ uint64_t ApproximateOldestKeyTime() const;
+
+ // Request a flush of all existing memtables to storage. This will
+ // cause future calls to IsFlushPending() to return true if this list is
+ // non-empty (regardless of the min_write_buffer_number_to_merge
+ // parameter). This flush request will persist until the next time
+ // PickMemtablesToFlush() is called.
+ void FlushRequested() { flush_requested_ = true; }
+
+ bool HasFlushRequested() { return flush_requested_; }
+
+ // Returns true if a history trim should be scheduled and the caller should
+ // be the one to schedule it.
+ bool MarkTrimHistoryNeeded() {
+ auto expected = false;
+ return imm_trim_needed.compare_exchange_strong(
+ expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+
+ void ResetTrimHistoryNeeded() {
+ auto expected = true;
+ imm_trim_needed.compare_exchange_strong(
+ expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+ }
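+ // Note: because both methods use compare_exchange_strong, only one of
+ // several concurrent MarkTrimHistoryNeeded() callers observes true and
+ // schedules the trim; ResetTrimHistoryNeeded() clears the flag so a later
+ // trim can be scheduled again.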
+
+ // Copying allowed
+ // MemTableList(const MemTableList&);
+ // void operator=(const MemTableList&);
+
+ size_t* current_memory_usage() { return &current_memory_usage_; }
+
+ // Returns the min log containing the prep section after the memtables
+ // listed in `memtables_to_flush` are flushed and their status is persisted
+ // in the manifest.
+ uint64_t PrecomputeMinLogContainingPrepSection(
+ const autovector<MemTable*>& memtables_to_flush);
+
+ uint64_t GetEarliestMemTableID() const {
+ auto& memlist = current_->memlist_;
+ if (memlist.empty()) {
+ return std::numeric_limits<uint64_t>::max();
+ }
+ return memlist.back()->GetID();
+ }
+
+ uint64_t GetLatestMemTableID() const {
+ auto& memlist = current_->memlist_;
+ if (memlist.empty()) {
+ return 0;
+ }
+ return memlist.front()->GetID();
+ }
+
+ void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+ const auto& memlist = current_->memlist_;
+ // Scan the memtable list from new to old
+ for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+ MemTable* mem = *it;
+ if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+ mem->atomic_flush_seqno_ = seq;
+ } else {
+ // Earlier memtables must have been assigned an atomic flush seq; no
+ // need to continue scanning.
+ break;
+ }
+ }
+ }
+
+ // Used only by DBImplSecondary during log replay.
+ // Remove memtables whose data were written before the WAL with log_number
+ // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
+ // not freed, but put into a vector for future deref and reclamation.
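+ // For example, with log_number == 5, every memtable in the list whose
+ // GetNextLogNumber() is <= 5 is removed.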
+ void RemoveOldMemTables(uint64_t log_number,
+ autovector<MemTable*>* to_delete);
+
+ private:
+ friend Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ VersionSet* vset, InstrumentedMutex* mu,
+ const autovector<FileMetaData*>& file_meta,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer);
+
+ // DB mutex held
+ void InstallNewVersion();
+
+ const int min_write_buffer_number_to_merge_;
+
+ MemTableListVersion* current_;
+
+ // the number of elements that still need flushing
+ int num_flush_not_started_;
+
+ // committing in progress
+ bool commit_in_progress_;
+
+ // Requested a flush of memtables to storage. It's possible to request that
+ // a subset of memtables be flushed.
+ bool flush_requested_;
+
+ // The current memory usage.
+ size_t current_memory_usage_;
+
+ // Cached value of current_->ApproximateMemoryUsageExcludingLast().
+ std::atomic<size_t> current_memory_usage_excluding_last_;
+
+ // Cached value of current_->HasHistory().
+ std::atomic<bool> current_has_history_;
+};
+
+// Installs memtable atomic flush results.
+// In most cases, imm_lists is nullptr, and the function simply uses the
+// immutable memtable lists associated with the cfds. Some unit tests install
+// flush results for external immutable memtable lists other than the
+// cfds' own immutable memtable lists, e.g. MemTableListTest. In that case,
+// the imm_lists parameter is not nullptr.
+extern Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+ InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+ autovector<MemTable*>* to_delete, Directory* db_directory,
+ LogBuffer* log_buffer);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc
new file mode 100644
index 000000000..a92bc6c79
--- /dev/null
+++ b/src/rocksdb/db/memtable_list_test.cc
@@ -0,0 +1,922 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/memtable_list.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTableListTest : public testing::Test {
+ public:
+ std::string dbname;
+ DB* db;
+ Options options;
+ std::vector<ColumnFamilyHandle*> handles;
+ std::atomic<uint64_t> file_number;
+
+ MemTableListTest() : db(nullptr), file_number(1) {
+ dbname = test::PerThreadDBPath("memtable_list_test");
+ options.create_if_missing = true;
+ DestroyDB(dbname, options);
+ }
+
+ // Create a test db if not yet created
+ void CreateDB() {
+ if (db == nullptr) {
+ options.create_if_missing = true;
+ DestroyDB(dbname, options);
+ // Open DB only with default column family
+ ColumnFamilyOptions cf_options;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options);
+ Status s = DB::Open(options, dbname, cf_descs, &handles, &db);
+ EXPECT_OK(s);
+
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ cf_opt2.cf_paths.emplace_back(dbname + "_two_1",
+ std::numeric_limits<uint64_t>::max());
+ int sz = static_cast<int>(handles.size());
+ handles.resize(sz + 2);
+ s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]);
+ EXPECT_OK(s);
+ s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]);
+ EXPECT_OK(s);
+
+ cf_descs.emplace_back("one", cf_options);
+ cf_descs.emplace_back("two", cf_options);
+ }
+ }
+
+ ~MemTableListTest() override {
+ if (db) {
+ std::vector<ColumnFamilyDescriptor> cf_descs(handles.size());
+ for (int i = 0; i != static_cast<int>(handles.size()); ++i) {
+ handles[i]->GetDescriptor(&cf_descs[i]);
+ }
+ for (auto h : handles) {
+ if (h) {
+ db->DestroyColumnFamilyHandle(h);
+ }
+ }
+ handles.clear();
+ delete db;
+ db = nullptr;
+ DestroyDB(dbname, options, cf_descs);
+ }
+ }
+
+ // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all
+ // structures needed to call this function.
+ Status Mock_InstallMemtableFlushResults(
+ MemTableList* list, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& m, autovector<MemTable*>* to_delete) {
+ // Create a mock Logger
+ test::NullLogger logger;
+ LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+ CreateDB();
+ // Create a mock VersionSet
+ DBOptions db_options;
+ db_options.file_system = FileSystem::Default();
+ ImmutableDBOptions immutable_db_options(db_options);
+ EnvOptions env_options;
+ std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+ WriteController write_controller(10000000u);
+
+ VersionSet versions(dbname, &immutable_db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller, /*block_cache_tracer=*/nullptr);
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back("one", ColumnFamilyOptions());
+ cf_descs.emplace_back("two", ColumnFamilyOptions());
+
+ EXPECT_OK(versions.Recover(cf_descs, false));
+
+ // Create mock default ColumnFamilyData
+ auto column_family_set = versions.GetColumnFamilySet();
+ LogsWithPrepTracker dummy_prep_tracker;
+ auto cfd = column_family_set->GetDefault();
+ EXPECT_TRUE(nullptr != cfd);
+ uint64_t file_num = file_number.fetch_add(1);
+ // Create dummy mutex.
+ InstrumentedMutex mutex;
+ InstrumentedMutexLock l(&mutex);
+ std::list<std::unique_ptr<FlushJobInfo>> flush_jobs_info;
+ Status s = list->TryInstallMemtableFlushResults(
+ cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex,
+ file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info);
+ return s;
+ }
+
+ // Calls InstallMemtableAtomicFlushResults() and sets up all
+ // structures needed to call this function.
+ Status Mock_InstallMemtableAtomicFlushResults(
+ autovector<MemTableList*>& lists, const autovector<uint32_t>& cf_ids,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ autovector<MemTable*>* to_delete) {
+ // Create a mock Logger
+ test::NullLogger logger;
+ LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+ CreateDB();
+ // Create a mock VersionSet
+ DBOptions db_options;
+ db_options.file_system.reset(new LegacyFileSystemWrapper(db_options.env));
+
+ ImmutableDBOptions immutable_db_options(db_options);
+ EnvOptions env_options;
+ std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+ WriteController write_controller(10000000u);
+
+ VersionSet versions(dbname, &immutable_db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller, /*block_cache_tracer=*/nullptr);
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back("one", ColumnFamilyOptions());
+ cf_descs.emplace_back("two", ColumnFamilyOptions());
+ EXPECT_OK(versions.Recover(cf_descs, false));
+
+ // Create mock default ColumnFamilyData
+
+ auto column_family_set = versions.GetColumnFamilySet();
+
+ LogsWithPrepTracker dummy_prep_tracker;
+ autovector<ColumnFamilyData*> cfds;
+ for (int i = 0; i != static_cast<int>(cf_ids.size()); ++i) {
+ cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i]));
+ EXPECT_NE(nullptr, cfds[i]);
+ }
+ std::vector<FileMetaData> file_metas;
+ file_metas.reserve(cf_ids.size());
+ for (size_t i = 0; i != cf_ids.size(); ++i) {
+ FileMetaData meta;
+ uint64_t file_num = file_number.fetch_add(1);
+ meta.fd = FileDescriptor(file_num, 0, 0);
+ file_metas.emplace_back(meta);
+ }
+ autovector<FileMetaData*> file_meta_ptrs;
+ for (auto& meta : file_metas) {
+ file_meta_ptrs.push_back(&meta);
+ }
+ InstrumentedMutex mutex;
+ InstrumentedMutexLock l(&mutex);
+ return InstallMemtableAtomicFlushResults(
+ &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex,
+ file_meta_ptrs, to_delete, nullptr, &log_buffer);
+ }
+};
+
+TEST_F(MemTableListTest, Empty) {
+ // Create an empty MemTableList and validate basic functions.
+ MemTableList list(1, 0, 0);
+
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+
+ autovector<MemTable*> mems;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &mems);
+ ASSERT_EQ(0, mems.size());
+
+ autovector<MemTable*> to_delete;
+ list.current()->Unref(&to_delete);
+ ASSERT_EQ(0, to_delete.size());
+}
+
+TEST_F(MemTableListTest, GetTest) {
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 2;
+ int max_write_buffer_number_to_maintain = 0;
+ int64_t max_write_buffer_size_to_maintain = 0;
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ SequenceNumber seq = 1;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ autovector<MemTable*> to_delete;
+
+ LookupKey lkey("key1", seq);
+ bool found = list.current()->Get(lkey, &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+
+ // Write some keys to this memtable.
+ mem->Add(++seq, kTypeDeletion, "key1", "");
+ mem->Add(++seq, kTypeValue, "key2", "value2");
+ mem->Add(++seq, kTypeValue, "key1", "value1");
+ mem->Add(++seq, kTypeValue, "key2", "value2.2");
+
+ // Fetch the newly written keys
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value1");
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", 2), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ // MemTable reports that this key was deleted, i.e. *not* found (at this sequence#)
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.2");
+
+ ASSERT_EQ(4, mem->num_entries());
+ ASSERT_EQ(1, mem->num_deletes());
+
+ // Add memtable to list
+ list.Add(mem, &to_delete);
+
+ SequenceNumber saved_seq = seq;
+
+ // Create another memtable and write some keys to it
+ WriteBufferManager wb2(options.db_write_buffer_size);
+ MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem2->Ref();
+
+ mem2->Add(++seq, kTypeDeletion, "key1", "");
+ mem2->Add(++seq, kTypeValue, "key2", "value2.3");
+
+ // Add second memtable to list
+ list.Add(mem2, &to_delete);
+
+ // Fetch keys via MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->Get(LookupKey("key1", saved_seq), &value, &s,
+ &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ("value1", value);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.3");
+
+ merge_context.Clear();
+ found = list.current()->Get(LookupKey("key2", 1), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ ASSERT_EQ(2, list.NumNotFlushed());
+
+ list.current()->Unref(&to_delete);
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+TEST_F(MemTableListTest, GetFromHistoryTest) {
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 2;
+ int max_write_buffer_number_to_maintain = 2;
+ int64_t max_write_buffer_size_to_maintain = 2000;
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ SequenceNumber seq = 1;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ autovector<MemTable*> to_delete;
+
+ LookupKey lkey("key1", seq);
+ bool found = list.current()->Get(lkey, &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+
+ // Write some keys to this memtable.
+ mem->Add(++seq, kTypeDeletion, "key1", "");
+ mem->Add(++seq, kTypeValue, "key2", "value2");
+ mem->Add(++seq, kTypeValue, "key2", "value2.2");
+
+ // Fetch the newly written keys
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ // MemTable reports that this key was deleted, i.e. *not* found (at this sequence#)
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.2");
+
+ // Add memtable to list
+ list.Add(mem, &to_delete);
+ ASSERT_EQ(0, to_delete.size());
+
+ // Fetch keys via MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ("value2.2", value);
+
+ // Flush this memtable from the list.
+ // (It will then be a part of the memtable history).
+ autovector<MemTable*> to_flush;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(1, to_flush.size());
+
+ MutableCFOptions mutable_cf_options(options);
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_EQ(1, list.NumFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Verify keys are no longer in MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Verify keys are present in history
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found);
+ ASSERT_EQ("value2.2", value);
+
+ // Create another memtable and write some keys to it
+ WriteBufferManager wb2(options.db_write_buffer_size);
+ MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem2->Ref();
+
+ mem2->Add(++seq, kTypeDeletion, "key1", "");
+ mem2->Add(++seq, kTypeValue, "key3", "value3");
+
+ // Add second memtable to list
+ list.Add(mem2, &to_delete);
+ ASSERT_EQ(0, to_delete.size());
+
+ to_flush.clear();
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(1, to_flush.size());
+
+ // Flush second memtable
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_EQ(2, list.NumFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Add a third memtable to push the first memtable out of the history
+ WriteBufferManager wb3(options.db_write_buffer_size);
+ MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem3->Ref();
+ list.Add(mem3, &to_delete);
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_EQ(1, list.NumFlushed());
+ ASSERT_EQ(1, to_delete.size());
+
+ // Verify keys are no longer in MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key3", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Verify that the second memtable's keys are in the history
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key1", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key3", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found);
+ ASSERT_EQ("value3", value);
+
+ // Verify that key2 from the first memtable is no longer in the history
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Cleanup
+ list.current()->Unref(&to_delete);
+ ASSERT_EQ(3, to_delete.size());
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+TEST_F(MemTableListTest, FlushPendingTest) {
+ const int num_tables = 6;
+ SequenceNumber seq = 1;
+ Status s;
+
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ InternalKeyComparator cmp(BytewiseComparator());
+ WriteBufferManager wb(options.db_write_buffer_size);
+ autovector<MemTable*> to_delete;
+
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 3;
+ int max_write_buffer_number_to_maintain = 7;
+ int64_t max_write_buffer_size_to_maintain =
+ 7 * static_cast<int>(options.write_buffer_size);
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ // Create some MemTables
+ uint64_t memtable_id = 0;
+ std::vector<MemTable*> tables;
+ MutableCFOptions mutable_cf_options(options);
+ for (int i = 0; i < num_tables; i++) {
+ MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->SetID(memtable_id++);
+ mem->Ref();
+
+ std::string value;
+ MergeContext merge_context;
+
+ mem->Add(++seq, kTypeValue, "key1", ToString(i));
+ mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN");
+ mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value");
+ mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM");
+ mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "");
+
+ tables.push_back(mem);
+ }
+
+ // Nothing to flush
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ autovector<MemTable*> to_flush;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(0, to_flush.size());
+
+ // Request a flush even though there is nothing to flush
+ list.FlushRequested();
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Attempt to 'flush' to clear request for flush
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(0, to_flush.size());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Request a flush again
+ list.FlushRequested();
+ // No flush pending since the list is empty.
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Add 2 tables
+ list.Add(tables[0], &to_delete);
+ list.Add(tables[1], &to_delete);
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Even though we have less than the minimum to flush, a flush is
+ // pending since we had previously requested a flush and never called
+ // PickMemtablesToFlush() to clear the flush.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(2, to_flush.size());
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Revert flush
+ list.RollbackMemtableFlush(to_flush, 0);
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ to_flush.clear();
+
+ // Add another table
+ list.Add(tables[2], &to_delete);
+ // We now have the minimum to flush regardless of whether FlushRequested()
+ // was called.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ ASSERT_EQ(3, to_flush.size());
+ ASSERT_EQ(3, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ autovector<MemTable*> to_flush2;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2);
+ ASSERT_EQ(0, to_flush2.size());
+ ASSERT_EQ(3, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Add another table
+ list.Add(tables[3], &to_delete);
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Request a flush again
+ list.FlushRequested();
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2);
+ ASSERT_EQ(1, to_flush2.size());
+ ASSERT_EQ(4, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Rollback first pick of tables
+ list.RollbackMemtableFlush(to_flush, 0);
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ to_flush.clear();
+
+ // Add another table
+ list.Add(tables[4], &to_delete);
+ ASSERT_EQ(5, list.NumNotFlushed());
+ // We now have the minimum to flush regardless of whether FlushRequested()
+ // was called.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+ // Should pick 4 of 5 since 1 table has been picked in to_flush2
+ ASSERT_EQ(4, to_flush.size());
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ autovector<MemTable*> to_flush3;
+ list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush3);
+ ASSERT_EQ(0, to_flush3.size()); // nothing left that is not already being flushed
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Flush the 4 memtables that were picked in to_flush
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+
+ // Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains
+ // tables[3].
+ // Current implementation will only commit memtables in the order they were
+ // created. So TryInstallMemtableFlushResults will install the first 3 tables
+ // in to_flush and stop when it encounters a table not yet flushed.
+ ASSERT_EQ(2, list.NumNotFlushed());
+ int num_in_history =
+ std::min(3, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(num_in_history, list.NumFlushed());
+ ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+ // Request a flush again. Should be nothing to flush
+ list.FlushRequested();
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Flush the 1 memtable that was picked in to_flush2
+ s = MemTableListTest::Mock_InstallMemtableFlushResults(
+ &list, mutable_cf_options, to_flush2, &to_delete);
+ ASSERT_OK(s);
+
+ // This will actually install 2 tables. The 1 we told it to flush, and also
+ // tables[4] which has been waiting for tables[3] to commit.
+ ASSERT_EQ(0, list.NumNotFlushed());
+ num_in_history =
+ std::min(5, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(num_in_history, list.NumFlushed());
+ ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+ // Verify this, by Ref'ing then UnRef'ing:
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+ to_delete.clear();
+
+ // Add another table
+ list.Add(tables[5], &to_delete);
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_EQ(5, list.GetLatestMemTableID());
+ memtable_id = 4;
+ // Pick tables to flush. The tables to pick must have ID smaller than or
+ // equal to 4. Therefore, no table will be selected in this case.
+ autovector<MemTable*> to_flush4;
+ list.FlushRequested();
+ ASSERT_TRUE(list.HasFlushRequested());
+ list.PickMemtablesToFlush(&memtable_id, &to_flush4);
+ ASSERT_TRUE(to_flush4.empty());
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.HasFlushRequested());
+
+ // Pick tables to flush. The tables to pick must have ID smaller than or
+ // equal to 5. Therefore, only tables[5] will be selected.
+ memtable_id = 5;
+ list.FlushRequested();
+ list.PickMemtablesToFlush(&memtable_id, &to_flush4);
+ ASSERT_EQ(1, static_cast<int>(to_flush4.size()));
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+ to_delete.clear();
+
+ list.current()->Unref(&to_delete);
+ int to_delete_size =
+ std::min(num_tables, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(to_delete_size, to_delete.size());
+
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+ // Verify this, by Ref'ing then UnRef'ing:
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+ to_delete.clear();
+}
+
+TEST_F(MemTableListTest, EmptyAtomicFlusTest) {
+ autovector<MemTableList*> lists;
+ autovector<uint32_t> cf_ids;
+ autovector<const MutableCFOptions*> options_list;
+ autovector<const autovector<MemTable*>*> to_flush;
+ autovector<MemTable*> to_delete;
+ Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list,
+ to_flush, &to_delete);
+ ASSERT_OK(s);
+ ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(MemTableListTest, AtomicFlusTest) {
+ const int num_cfs = 3;
+ const int num_tables_per_cf = 2;
+ SequenceNumber seq = 1;
+
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ InternalKeyComparator cmp(BytewiseComparator());
+ WriteBufferManager wb(options.db_write_buffer_size);
+
+ // Create MemTableLists
+ int min_write_buffer_number_to_merge = 3;
+ int max_write_buffer_number_to_maintain = 7;
+ int64_t max_write_buffer_size_to_maintain =
+ 7 * static_cast<int64_t>(options.write_buffer_size);
+ autovector<MemTableList*> lists;
+ for (int i = 0; i != num_cfs; ++i) {
+ lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain));
+ }
+
+ autovector<uint32_t> cf_ids;
+ std::vector<std::vector<MemTable*>> tables(num_cfs);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ uint32_t cf_id = 0;
+ for (auto& elem : tables) {
+ mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
+ uint64_t memtable_id = 0;
+ for (int i = 0; i != num_tables_per_cf; ++i) {
+ MemTable* mem =
+ new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
+ kMaxSequenceNumber, cf_id);
+ mem->SetID(memtable_id++);
+ mem->Ref();
+
+ std::string value;
+
+ mem->Add(++seq, kTypeValue, "key1", ToString(i));
+ mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN");
+ mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value");
+ mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM");
+ mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "");
+
+ elem.push_back(mem);
+ }
+ cf_ids.push_back(cf_id++);
+ }
+
+ std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
+
+ // Nothing to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ auto* list = lists[i];
+ ASSERT_FALSE(list->IsFlushPending());
+ ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+ list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]);
+ ASSERT_EQ(0, flush_candidates[i].size());
+ }
+ // Request flush even though there is nothing to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ auto* list = lists[i];
+ list->FlushRequested();
+ ASSERT_FALSE(list->IsFlushPending());
+ ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+ }
+ autovector<MemTable*> to_delete;
+ // Add tables to the immutable memtable lists associated with column families
+ for (auto i = 0; i != num_cfs; ++i) {
+ for (auto j = 0; j != num_tables_per_cf; ++j) {
+ lists[i]->Add(tables[i][j], &to_delete);
+ }
+ ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
+ ASSERT_TRUE(lists[i]->IsFlushPending());
+ ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
+ }
+ std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
+ // +----+
+ // list[0]: |0 1|
+ // list[1]: |0 1|
+ // | +--+
+ // list[2]: |0| 1
+ // +-+
+ // Pick memtables to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ flush_candidates[i].clear();
+ lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i],
+ &flush_candidates[i]);
+ ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
+ static_cast<uint64_t>(flush_candidates[i].size()));
+ }
+ autovector<MemTableList*> tmp_lists;
+ autovector<uint32_t> tmp_cf_ids;
+ autovector<const MutableCFOptions*> tmp_options_list;
+ autovector<const autovector<MemTable*>*> to_flush;
+ for (auto i = 0; i != num_cfs; ++i) {
+ if (!flush_candidates[i].empty()) {
+ to_flush.push_back(&flush_candidates[i]);
+ tmp_lists.push_back(lists[i]);
+ tmp_cf_ids.push_back(i);
+ tmp_options_list.push_back(mutable_cf_options_list[i]);
+ }
+ }
+ Status s = Mock_InstallMemtableAtomicFlushResults(
+ tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
+ ASSERT_OK(s);
+
+ for (auto i = 0; i != num_cfs; ++i) {
+ for (auto j = 0; j != num_tables_per_cf; ++j) {
+ if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
+ ASSERT_LT(0, tables[i][j]->GetFileNumber());
+ }
+ }
+ ASSERT_EQ(
+ static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
+ lists[i]->NumNotFlushed());
+ }
+
+ to_delete.clear();
+ for (auto list : lists) {
+ list->current()->Unref(&to_delete);
+ delete list;
+ }
+ for (auto& mutable_cf_options : mutable_cf_options_list) {
+ if (mutable_cf_options != nullptr) {
+ delete mutable_cf_options;
+ mutable_cf_options = nullptr;
+ }
+ }
+  // All memtables in the tables array must have been flushed and are thus
+  // ready to be deleted.
+ ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size());
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling InstallMemtableFlushResults.
+ // Verify this by Ref'ing and then Unref'ing.
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
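
The atomic-flush test above drives MemTableList through a pick-then-install cycle, capping the memtables picked per column family by ID. Below is a small illustrative fragment of that call sequence, reusing the identifiers from the test; it is a sketch, not an additional test case.

    // Illustrative fragment: per column family, pick every immutable memtable
    // whose ID is <= the cap; the picks are then installed atomically together
    // with the other families' picks via Mock_InstallMemtableAtomicFlushResults().
    autovector<MemTable*> picked;
    uint64_t max_memtable_id = 1;  // the cap used for lists[0] and lists[1]
    lists[0]->PickMemtablesToFlush(&max_memtable_id, &picked);
    // With a cap of 0, as for lists[2], only memtable 0 is picked and
    // memtable 1 stays unflushed, matching the diagram in the test.
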
diff --git a/src/rocksdb/db/merge_context.h b/src/rocksdb/db/merge_context.h
new file mode 100644
index 000000000..e1869a341
--- /dev/null
+++ b/src/rocksdb/db/merge_context.h
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::vector<Slice> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), the DB creates such an object and passes it down when
+// issuing the Get() to the memtables and version_set. The operands
+// are fetched from the context when issuing a partial or full merge.
+class MergeContext {
+ public:
+ // Clear all the operands
+ void Clear() {
+ if (operand_list_) {
+ operand_list_->clear();
+ copied_operands_->clear();
+ }
+ }
+
+ // Push a merge operand
+ void PushOperand(const Slice& operand_slice, bool operand_pinned = false) {
+ Initialize();
+ SetDirectionBackward();
+
+ if (operand_pinned) {
+ operand_list_->push_back(operand_slice);
+ } else {
+ // We need to have our own copy of the operand since it's not pinned
+ copied_operands_->emplace_back(
+ new std::string(operand_slice.data(), operand_slice.size()));
+ operand_list_->push_back(*copied_operands_->back());
+ }
+ }
+
+ // Push back a merge operand
+ void PushOperandBack(const Slice& operand_slice,
+ bool operand_pinned = false) {
+ Initialize();
+ SetDirectionForward();
+
+ if (operand_pinned) {
+ operand_list_->push_back(operand_slice);
+ } else {
+ // We need to have our own copy of the operand since it's not pinned
+ copied_operands_->emplace_back(
+ new std::string(operand_slice.data(), operand_slice.size()));
+ operand_list_->push_back(*copied_operands_->back());
+ }
+ }
+
+ // return total number of operands in the list
+ size_t GetNumOperands() const {
+ if (!operand_list_) {
+ return 0;
+ }
+ return operand_list_->size();
+ }
+
+ // Get the operand at the index.
+ Slice GetOperand(int index) {
+ assert(operand_list_);
+
+ SetDirectionForward();
+ return (*operand_list_)[index];
+ }
+
+ // Same as GetOperandsDirectionForward
+ const std::vector<Slice>& GetOperands() {
+ return GetOperandsDirectionForward();
+ }
+
+ // Return all the operands in the order as they were merged (passed to
+ // FullMerge or FullMergeV2)
+ const std::vector<Slice>& GetOperandsDirectionForward() {
+ if (!operand_list_) {
+ return empty_operand_list;
+ }
+
+ SetDirectionForward();
+ return *operand_list_;
+ }
+
+ // Return all the operands in the reversed order relative to how they were
+ // merged (passed to FullMerge or FullMergeV2)
+ const std::vector<Slice>& GetOperandsDirectionBackward() {
+ if (!operand_list_) {
+ return empty_operand_list;
+ }
+
+ SetDirectionBackward();
+ return *operand_list_;
+ }
+
+ private:
+ void Initialize() {
+ if (!operand_list_) {
+ operand_list_.reset(new std::vector<Slice>());
+ copied_operands_.reset(new std::vector<std::unique_ptr<std::string>>());
+ }
+ }
+
+ void SetDirectionForward() {
+ if (operands_reversed_ == true) {
+ std::reverse(operand_list_->begin(), operand_list_->end());
+ operands_reversed_ = false;
+ }
+ }
+
+ void SetDirectionBackward() {
+ if (operands_reversed_ == false) {
+ std::reverse(operand_list_->begin(), operand_list_->end());
+ operands_reversed_ = true;
+ }
+ }
+
+ // List of operands
+ std::unique_ptr<std::vector<Slice>> operand_list_;
+ // Copy of operands that are not pinned.
+ std::unique_ptr<std::vector<std::unique_ptr<std::string>>> copied_operands_;
+ bool operands_reversed_ = true;
+};
+
+} // namespace ROCKSDB_NAMESPACE
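
MergeContext collects operands while a lookup walks entries from newest to oldest and later hands them to the merge operator in chronological order. A minimal usage sketch against the header above; the function name and operand values are hypothetical.

    #include <cassert>

    #include "db/merge_context.h"

    void MergeContextSketch() {
      ROCKSDB_NAMESPACE::MergeContext ctx;
      // Operands arrive newest-first during a backward scan; unpinned slices
      // are copied so they outlive the source buffer.
      ctx.PushOperand(ROCKSDB_NAMESPACE::Slice("newest"));
      ctx.PushOperand(ROCKSDB_NAMESPACE::Slice("oldest"));
      // GetOperands() flips the list to forward (oldest-first) order:
      // {"oldest", "newest"}.
      const std::vector<ROCKSDB_NAMESPACE::Slice>& ops = ctx.GetOperands();
      assert(ops.size() == ctx.GetNumOperands());
      (void)ops;
    }
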
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
new file mode 100644
index 000000000..96fe79251
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.cc
@@ -0,0 +1,417 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/likely.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
+ const MergeOperator* user_merge_operator,
+ const CompactionFilter* compaction_filter,
+ Logger* logger, bool assert_valid_internal_key,
+ SequenceNumber latest_snapshot,
+ const SnapshotChecker* snapshot_checker, int level,
+ Statistics* stats,
+ const std::atomic<bool>* shutting_down)
+ : env_(env),
+ user_comparator_(user_comparator),
+ user_merge_operator_(user_merge_operator),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ logger_(logger),
+ assert_valid_internal_key_(assert_valid_internal_key),
+ allow_single_operand_(false),
+ latest_snapshot_(latest_snapshot),
+ snapshot_checker_(snapshot_checker),
+ level_(level),
+ keys_(),
+ filter_timer_(env_),
+ total_filter_time_(0U),
+ stats_(stats) {
+ assert(user_comparator_ != nullptr);
+ if (user_merge_operator_) {
+ allow_single_operand_ = user_merge_operator_->AllowSingleOperand();
+ }
+}
+
+Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator,
+ const Slice& key, const Slice* value,
+ const std::vector<Slice>& operands,
+ std::string* result, Logger* logger,
+ Statistics* statistics, Env* env,
+ Slice* result_operand,
+ bool update_num_ops_stats) {
+ assert(merge_operator != nullptr);
+
+ if (operands.size() == 0) {
+ assert(value != nullptr && result != nullptr);
+ result->assign(value->data(), value->size());
+ return Status::OK();
+ }
+
+ if (update_num_ops_stats) {
+ RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS,
+ static_cast<uint64_t>(operands.size()));
+ }
+
+ bool success;
+ Slice tmp_result_operand(nullptr, 0);
+ const MergeOperator::MergeOperationInput merge_in(key, value, operands,
+ logger);
+ MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand);
+ {
+ // Setup to time the merge
+ StopWatchNano timer(env, statistics != nullptr);
+ PERF_TIMER_GUARD(merge_operator_time_nanos);
+
+ // Do the merge
+ success = merge_operator->FullMergeV2(merge_in, &merge_out);
+
+ if (tmp_result_operand.data()) {
+ // FullMergeV2 result is an existing operand
+ if (result_operand != nullptr) {
+ *result_operand = tmp_result_operand;
+ } else {
+ result->assign(tmp_result_operand.data(), tmp_result_operand.size());
+ }
+ } else if (result_operand) {
+ *result_operand = Slice(nullptr, 0);
+ }
+
+ RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
+ statistics ? timer.ElapsedNanos() : 0);
+ }
+
+ if (!success) {
+ RecordTick(statistics, NUMBER_MERGE_FAILURES);
+ return Status::Corruption("Error: Could not perform merge.");
+ }
+
+ return Status::OK();
+}
+
+// PRE: iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+// keys_, operands_ are updated to reflect the merge result.
+// keys_ stores the list of keys encountered while merging.
+// operands_ stores the list of merge operands encountered while merging.
+// keys_[i] corresponds to operands_[i] for each i.
+//
+// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator
+// and just pass the StripeRep corresponding to the stripe being merged.
+Status MergeHelper::MergeUntil(InternalIterator* iter,
+ CompactionRangeDelAggregator* range_del_agg,
+ const SequenceNumber stop_before,
+ const bool at_bottom) {
+ // Get a copy of the internal key, before it's invalidated by iter->Next()
+ // Also maintain the list of merge operands seen.
+ assert(HasOperator());
+ keys_.clear();
+ merge_context_.Clear();
+ has_compaction_filter_skip_until_ = false;
+ assert(user_merge_operator_);
+ bool first_key = true;
+
+ // We need to parse the internal key again as the parsed key is
+ // backed by the internal key!
+ // Assume no internal key corruption as it has been successfully parsed
+ // by the caller.
+ // original_key_is_iter variable is just caching the information:
+ // original_key_is_iter == (iter->key().ToString() == original_key)
+ bool original_key_is_iter = true;
+ std::string original_key = iter->key().ToString();
+ // Important:
+ // orig_ikey is backed by original_key if keys_.empty()
+ // orig_ikey is backed by keys_.back() if !keys_.empty()
+ ParsedInternalKey orig_ikey;
+ bool succ = ParseInternalKey(original_key, &orig_ikey);
+ assert(succ);
+ if (!succ) {
+ return Status::Corruption("Cannot parse key in MergeUntil");
+ }
+
+ Status s;
+ bool hit_the_next_user_key = false;
+ for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
+ if (IsShuttingDown()) {
+ return Status::ShutdownInProgress();
+ }
+
+ ParsedInternalKey ikey;
+ assert(keys_.size() == merge_context_.GetNumOperands());
+
+ if (!ParseInternalKey(iter->key(), &ikey)) {
+ // stop at corrupted key
+ if (assert_valid_internal_key_) {
+ assert(!"Corrupted internal key not expected.");
+ return Status::Corruption("Corrupted internal key not expected.");
+ }
+ break;
+ } else if (first_key) {
+ assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
+ first_key = false;
+ } else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) {
+ // hit a different user key, stop right here
+ hit_the_next_user_key = true;
+ break;
+ } else if (stop_before > 0 && ikey.sequence <= stop_before &&
+ LIKELY(snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(ikey.sequence,
+ stop_before) !=
+ SnapshotCheckerResult::kNotInSnapshot)) {
+      // hit an entry that's possibly visible to the previous snapshot; we
+      // can't touch it
+ break;
+ }
+
+ // At this point we are guaranteed that we need to process this key.
+
+ assert(IsValueType(ikey.type));
+ if (ikey.type != kTypeMerge) {
+
+ // hit a put/delete/single delete
+ // => merge the put value or a nullptr with operands_
+ // => store result in operands_.back() (and update keys_.back())
+ // => change the entry type to kTypeValue for keys_.back()
+ // We are done! Success!
+
+ // If there are no operands, just return the Status::OK(). That will cause
+ // the compaction iterator to write out the key we're currently at, which
+ // is the put/delete we just encountered.
+ if (keys_.empty()) {
+ return Status::OK();
+ }
+
+ // TODO(noetzli) If the merge operator returns false, we are currently
+ // (almost) silently dropping the put/delete. That's probably not what we
+ // want. Also if we're in compaction and it's a put, it would be nice to
+ // run compaction filter on it.
+ const Slice val = iter->value();
+ const Slice* val_ptr;
+ if (kTypeValue == ikey.type &&
+ (range_del_agg == nullptr ||
+ !range_del_agg->ShouldDelete(
+ ikey, RangeDelPositioningMode::kForwardTraversal))) {
+ val_ptr = &val;
+ } else {
+ val_ptr = nullptr;
+ }
+ std::string merge_result;
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, env_);
+
+ // We store the result in keys_.back() and operands_.back()
+ // if nothing went wrong (i.e.: no operand corruption on disk)
+ if (s.ok()) {
+ // The original key encountered
+ original_key = std::move(keys_.back());
+ orig_ikey.type = kTypeValue;
+ UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+ keys_.clear();
+ merge_context_.Clear();
+ keys_.emplace_front(std::move(original_key));
+ merge_context_.PushOperand(merge_result);
+ }
+
+ // move iter to the next entry
+ iter->Next();
+ return s;
+ } else {
+ // hit a merge
+ // => if there is a compaction filter, apply it.
+ // => check for range tombstones covering the operand
+ // => merge the operand into the front of the operands_ list
+ // if not filtered
+ // => then continue because we haven't yet seen a Put/Delete.
+ //
+ // Keep queuing keys and operands until we either meet a put / delete
+      // request or later do a partial merge.
+
+ Slice value_slice = iter->value();
+ // add an operand to the list if:
+ // 1) it's included in one of the snapshots. in that case we *must* write
+ // it out, no matter what compaction filter says
+ // 2) it's not filtered by a compaction filter
+ CompactionFilter::Decision filter =
+ ikey.sequence <= latest_snapshot_
+ ? CompactionFilter::Decision::kKeep
+ : FilterMerge(orig_ikey.user_key, value_slice);
+ if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ range_del_agg != nullptr &&
+ range_del_agg->ShouldDelete(
+ iter->key(), RangeDelPositioningMode::kForwardTraversal)) {
+ filter = CompactionFilter::Decision::kRemove;
+ }
+ if (filter == CompactionFilter::Decision::kKeep ||
+ filter == CompactionFilter::Decision::kChangeValue) {
+ if (original_key_is_iter) {
+ // this is just an optimization that saves us one memcpy
+ keys_.push_front(std::move(original_key));
+ } else {
+ keys_.push_front(iter->key().ToString());
+ }
+ if (keys_.size() == 1) {
+ // we need to re-anchor the orig_ikey because it was anchored by
+ // original_key before
+ ParseInternalKey(keys_.back(), &orig_ikey);
+ }
+ if (filter == CompactionFilter::Decision::kKeep) {
+ merge_context_.PushOperand(
+ value_slice, iter->IsValuePinned() /* operand_pinned */);
+ } else { // kChangeValue
+ // Compaction filter asked us to change the operand from value_slice
+ // to compaction_filter_value_.
+ merge_context_.PushOperand(compaction_filter_value_, false);
+ }
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ // Compaction filter asked us to remove this key altogether
+ // (not just this operand), along with some keys following it.
+ keys_.clear();
+ merge_context_.Clear();
+ has_compaction_filter_skip_until_ = true;
+ return Status::OK();
+ }
+ }
+ }
+
+ if (merge_context_.GetNumOperands() == 0) {
+ // we filtered out all the merge operands
+ return Status::OK();
+ }
+
+ // We are sure we have seen this key's entire history if:
+ // at_bottom == true (this does not necessarily mean it is the bottommost
+ // layer, but rather that we are confident the key does not appear on any of
+  // the lower layers; at_bottom == false doesn't mean it does appear, just
+  // that we can't be sure; see Compaction::IsBottommostLevel for details)
+ // AND
+ // we have either encountered another key or end of key history on this
+ // layer.
+ //
+ // When these conditions are true we are able to merge all the keys
+ // using full merge.
+ //
+  // For the cases we are not sure about, we simply miss the opportunity
+ // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+ // sure that all merge-operands on the same level get compacted together,
+ // this will simply lead to these merge operands moving to the next level.
+ bool surely_seen_the_beginning =
+ (hit_the_next_user_key || !iter->Valid()) && at_bottom;
+ if (surely_seen_the_beginning) {
+ // do a final merge with nullptr as the existing value and say
+ // bye to the merge type (it's now converted to a Put)
+ assert(kTypeMerge == orig_ikey.type);
+ assert(merge_context_.GetNumOperands() >= 1);
+ assert(merge_context_.GetNumOperands() == keys_.size());
+ std::string merge_result;
+ s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, env_);
+ if (s.ok()) {
+ // The original key encountered
+ // We are certain that keys_ is not empty here (see assertions couple of
+ // lines before).
+ original_key = std::move(keys_.back());
+ orig_ikey.type = kTypeValue;
+ UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+ keys_.clear();
+ merge_context_.Clear();
+ keys_.emplace_front(std::move(original_key));
+ merge_context_.PushOperand(merge_result);
+ }
+ } else {
+ // We haven't seen the beginning of the key nor a Put/Delete.
+ // Attempt to use the user's associative merge function to
+ // merge the stacked merge operands into a single operand.
+ s = Status::MergeInProgress();
+ if (merge_context_.GetNumOperands() >= 2 ||
+ (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) {
+ bool merge_success = false;
+ std::string merge_result;
+ {
+ StopWatchNano timer(env_, stats_ != nullptr);
+ PERF_TIMER_GUARD(merge_operator_time_nanos);
+ merge_success = user_merge_operator_->PartialMergeMulti(
+ orig_ikey.user_key,
+ std::deque<Slice>(merge_context_.GetOperands().begin(),
+ merge_context_.GetOperands().end()),
+ &merge_result, logger_);
+ RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
+ stats_ ? timer.ElapsedNanosSafe() : 0);
+ }
+ if (merge_success) {
+ // Merging of operands (associative merge) was successful.
+ // Replace operands with the merge result
+ merge_context_.Clear();
+ merge_context_.PushOperand(merge_result);
+ keys_.erase(keys_.begin(), keys_.end() - 1);
+ }
+ }
+ }
+
+ return s;
+}
+
+MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
+ : merge_helper_(merge_helper) {
+ it_keys_ = merge_helper_->keys().rend();
+ it_values_ = merge_helper_->values().rend();
+}
+
+void MergeOutputIterator::SeekToFirst() {
+ const auto& keys = merge_helper_->keys();
+ const auto& values = merge_helper_->values();
+ assert(keys.size() == values.size());
+ it_keys_ = keys.rbegin();
+ it_values_ = values.rbegin();
+}
+
+void MergeOutputIterator::Next() {
+ ++it_keys_;
+ ++it_values_;
+}
+
+CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key,
+ const Slice& value_slice) {
+ if (compaction_filter_ == nullptr) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+ filter_timer_.Start();
+ }
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ auto ret = compaction_filter_->FilterV2(
+ level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice,
+ &compaction_filter_value_, compaction_filter_skip_until_.rep());
+ if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(),
+ user_key) <= 0) {
+ // Invalid skip_until returned from compaction filter.
+ // Keep the key as per FilterV2 documentation.
+ ret = CompactionFilter::Decision::kKeep;
+ } else {
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ }
+ }
+ total_filter_time_ += filter_timer_.ElapsedNanosSafe();
+ return ret;
+}
+
+} // namespace ROCKSDB_NAMESPACE
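
TimedFullMerge() above is the single entry point both point lookups and compaction use to combine an optional base value with the accumulated operands. A hedged sketch of a call site follows; the wrapper function and its parameters are hypothetical, only the TimedFullMerge() signature comes from the code above.

    #include <string>

    #include "db/merge_helper.h"

    // Hypothetical helper: resolve the final value for `user_key` from an
    // optional base value plus the operands gathered in a MergeContext.
    ROCKSDB_NAMESPACE::Status ResolveValueSketch(
        const ROCKSDB_NAMESPACE::MergeOperator* merge_operator,
        const ROCKSDB_NAMESPACE::Slice& user_key,
        const ROCKSDB_NAMESPACE::Slice* base_value,  // nullptr if no Put seen
        ROCKSDB_NAMESPACE::MergeContext& context,
        ROCKSDB_NAMESPACE::Logger* logger,
        ROCKSDB_NAMESPACE::Statistics* stats, ROCKSDB_NAMESPACE::Env* env,
        std::string* merged) {
      // Returns Corruption if the operator reports a failed merge, OK otherwise.
      return ROCKSDB_NAMESPACE::MergeHelper::TimedFullMerge(
          merge_operator, user_key, base_value, context.GetOperands(), merged,
          logger, stats, env);
    }
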
diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h
new file mode 100644
index 000000000..c0534f08b
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.h
@@ -0,0 +1,194 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+
+class MergeHelper {
+ public:
+ MergeHelper(Env* env, const Comparator* user_comparator,
+ const MergeOperator* user_merge_operator,
+ const CompactionFilter* compaction_filter, Logger* logger,
+ bool assert_valid_internal_key, SequenceNumber latest_snapshot,
+ const SnapshotChecker* snapshot_checker = nullptr, int level = 0,
+ Statistics* stats = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr);
+
+ // Wrapper around MergeOperator::FullMergeV2() that records perf statistics.
+ // Result of merge will be written to result if status returned is OK.
+ // If operands is empty, the value will simply be copied to result.
+  // Set `update_num_ops_stats` to true if the merge is on behalf of a user
+  // read, so that the number of merge operands is recorded in the read
+  // statistics.
+ // Returns one of the following statuses:
+ // - OK: Entries were successfully merged.
+ // - Corruption: Merge operator reported unsuccessful merge.
+ static Status TimedFullMerge(const MergeOperator* merge_operator,
+ const Slice& key, const Slice* value,
+ const std::vector<Slice>& operands,
+ std::string* result, Logger* logger,
+ Statistics* statistics, Env* env,
+ Slice* result_operand = nullptr,
+ bool update_num_ops_stats = false);
+
+ // Merge entries until we hit
+ // - a corrupted key
+ // - a Put/Delete,
+ // - a different user key,
+ // - a specific sequence number (snapshot boundary),
+ // - REMOVE_AND_SKIP_UNTIL returned from compaction filter,
+ // or - the end of iteration
+ // iter: (IN) points to the first merge type entry
+ // (OUT) points to the first entry not included in the merge process
+ // range_del_agg: (IN) filters merge operands covered by range tombstones.
+ // stop_before: (IN) a sequence number that merge should not cross.
+ // 0 means no restriction
+  // at_bottom: (IN) true if the iterator covers the bottom level, which means
+ // we could reach the start of the history of this user key.
+ //
+ // Returns one of the following statuses:
+ // - OK: Entries were successfully merged.
+ // - MergeInProgress: Put/Delete not encountered, and didn't reach the start
+ // of key's history. Output consists of merge operands only.
+ // - Corruption: Merge operator reported unsuccessful merge or a corrupted
+ // key has been encountered and not expected (applies only when compiling
+ // with asserts removed).
+ // - ShutdownInProgress: interrupted by shutdown (*shutting_down == true).
+ //
+ // REQUIRED: The first key in the input is not corrupted.
+ Status MergeUntil(InternalIterator* iter,
+ CompactionRangeDelAggregator* range_del_agg = nullptr,
+ const SequenceNumber stop_before = 0,
+ const bool at_bottom = false);
+
+ // Filters a merge operand using the compaction filter specified
+ // in the constructor. Returns the decision that the filter made.
+ // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the
+ // optional outputs of compaction filter.
+ CompactionFilter::Decision FilterMerge(const Slice& user_key,
+ const Slice& value_slice);
+
+ // Query the merge result
+ // These are valid until the next MergeUntil call
+ // If the merging was successful:
+ // - keys() contains a single element with the latest sequence number of
+ // the merges. The type will be Put or Merge. See IMPORTANT 1 note, below.
+ // - values() contains a single element with the result of merging all the
+ // operands together
+ //
+ // IMPORTANT 1: the key type could change after the MergeUntil call.
+ // Put/Delete + Merge + ... + Merge => Put
+ // Merge + ... + Merge => Merge
+ //
+ // If the merge operator is not associative, and if a Put/Delete is not found
+ // then the merging will be unsuccessful. In this case:
+ // - keys() contains the list of internal keys seen in order of iteration.
+ // - values() contains the list of values (merges) seen in the same order.
+ // values() is parallel to keys() so that the first entry in
+ // keys() is the key associated with the first entry in values()
+ // and so on. These lists will be the same length.
+ // All of these pairs will be merges over the same user key.
+ // See IMPORTANT 2 note below.
+ //
+ // IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+ // So keys().back() was the first key seen by iterator.
+ // TODO: Re-style this comment to be like the first one
+ const std::deque<std::string>& keys() const { return keys_; }
+ const std::vector<Slice>& values() const {
+ return merge_context_.GetOperands();
+ }
+ uint64_t TotalFilterTime() const { return total_filter_time_; }
+ bool HasOperator() const { return user_merge_operator_ != nullptr; }
+
+ // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will
+ // return true and fill *until with the key to which we should skip.
+ // If true, keys() and values() are empty.
+ bool FilteredUntil(Slice* skip_until) const {
+ if (!has_compaction_filter_skip_until_) {
+ return false;
+ }
+ assert(compaction_filter_ != nullptr);
+ assert(skip_until != nullptr);
+ assert(compaction_filter_skip_until_.Valid());
+ *skip_until = compaction_filter_skip_until_.Encode();
+ return true;
+ }
+
+ private:
+ Env* env_;
+ const Comparator* user_comparator_;
+ const MergeOperator* user_merge_operator_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ Logger* logger_;
+ bool assert_valid_internal_key_; // enforce no internal key corruption?
+ bool allow_single_operand_;
+ SequenceNumber latest_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ int level_;
+
+ // the scratch area that holds the result of MergeUntil
+ // valid up to the next MergeUntil call
+
+ // Keeps track of the sequence of keys seen
+ std::deque<std::string> keys_;
+ // Parallel with keys_; stores the operands
+ mutable MergeContext merge_context_;
+
+ StopWatchNano filter_timer_;
+ uint64_t total_filter_time_;
+ Statistics* stats_;
+
+ bool has_compaction_filter_skip_until_ = false;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+};
+
+// MergeOutputIterator can be used to iterate over the result of a merge.
+class MergeOutputIterator {
+ public:
+ // The MergeOutputIterator is bound to a MergeHelper instance.
+ explicit MergeOutputIterator(const MergeHelper* merge_helper);
+
+ // Seeks to the first record in the output.
+ void SeekToFirst();
+ // Advances to the next record in the output.
+ void Next();
+
+ Slice key() { return Slice(*it_keys_); }
+ Slice value() { return Slice(*it_values_); }
+ bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+
+ private:
+ const MergeHelper* merge_helper_;
+ std::deque<std::string>::const_reverse_iterator it_keys_;
+ std::vector<Slice>::const_reverse_iterator it_values_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
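
MergeOutputIterator replays the keys()/values() produced by the most recent MergeUntil() in the order the input iterator originally saw the entries (newest first), since the deque is filled back-to-front as the IMPORTANT 2 note above describes. A sketch of the consumption pattern, mirroring the tests that follow; `helper` is a hypothetical MergeHelper on which MergeUntil() has already returned.

    // Iterate over the merge output; keys are internal keys, values are the
    // corresponding merge results or remaining operands.
    ROCKSDB_NAMESPACE::MergeOutputIterator out(&helper);
    for (out.SeekToFirst(); out.Valid(); out.Next()) {
      ROCKSDB_NAMESPACE::Slice key = out.key();
      ROCKSDB_NAMESPACE::Slice value = out.value();
      (void)key;
      (void)value;
    }
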
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 000000000..117916c8f
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,290 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/merge_helper.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeHelperTest : public testing::Test {
+ public:
+ MergeHelperTest() { env_ = Env::Default(); }
+
+ ~MergeHelperTest() override = default;
+
+ Status Run(SequenceNumber stop_before, bool at_bottom,
+ SequenceNumber latest_snapshot = 0) {
+ iter_.reset(new test::VectorIterator(ks_, vs_));
+ iter_->SeekToFirst();
+ merge_helper_.reset(new MergeHelper(env_, BytewiseComparator(),
+ merge_op_.get(), filter_.get(), nullptr,
+ false, latest_snapshot));
+ return merge_helper_->MergeUntil(iter_.get(), nullptr /* range_del_agg */,
+ stop_before, at_bottom);
+ }
+
+ void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
+ const ValueType& t, const std::string& val,
+ bool corrupt = false) {
+ InternalKey ikey(user_key, seq, t);
+ if (corrupt) {
+ test::CorruptKeyType(&ikey);
+ }
+ ks_.push_back(ikey.Encode().ToString());
+ vs_.push_back(val);
+ }
+
+ Env* env_;
+ std::unique_ptr<test::VectorIterator> iter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::vector<std::string> ks_;
+ std::vector<std::string> vs_;
+ std::unique_ptr<test::FilterNumber> filter_;
+};
+
+// If MergeHelper encounters a new key on the last level, we know that
+// the key has no more history and it can merge keys.
+TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U)); // <- iter_ after merge
+
+ ASSERT_TRUE(Run(0, true).ok());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a value results in a successful merge.
+TEST_F(MergeHelperTest, MergeValue) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U)); // <- iter_ after merge
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(0, false).ok());
+ ASSERT_EQ(ks_[3], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging stops before a snapshot.
+TEST_F(MergeHelperTest, SnapshotBeforeValue) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U)); // <- iter_ after merge
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// MergeHelper preserves the operand stack for merge operators that
+// cannot do a partial merge.
+TEST_F(MergeHelperTest, NoPartialMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, "v2");
+ AddKeyVal("a", 40, kTypeMerge, "v"); // <- iter_ after merge
+ AddKeyVal("a", 30, kTypeMerge, "v");
+
+ ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ("v", merge_helper_->values()[0]);
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[1]);
+ ASSERT_EQ("v2", merge_helper_->values()[1]);
+ ASSERT_EQ(2U, merge_helper_->keys().size());
+ ASSERT_EQ(2U, merge_helper_->values().size());
+}
+
+// A single operand cannot be merged.
+TEST_F(MergeHelperTest, SingleOperand) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(31, false).IsMergeInProgress());
+ ASSERT_FALSE(iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a deletion turns the deletion into a value
+TEST_F(MergeHelperTest, MergeDeletion) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 20, kTypeDeletion, "");
+
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The merge helper stops upon encountering a corrupt key
+TEST_F(MergeHelperTest, CorruptKey) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
+ // Corrupt key
+ AddKeyVal("a", 20, kTypeDeletion, "", true); // <- iter_ after merge
+
+ ASSERT_TRUE(Run(15, false).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The compaction filter is called on every merge operand
+TEST_F(MergeHelperTest, FilterMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+TEST_F(MergeHelperTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+
+ // filtered out all
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // we have one operand that will survive because it's a delete
+ AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
+ AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
+ ASSERT_TRUE(Run(15, true).ok());
+ merge_output_iter = MergeOutputIterator(merge_helper_.get());
+ ASSERT_TRUE(iter_->Valid());
+ merge_output_iter.SeekToFirst();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // when all merge operands are filtered out, we leave the iterator pointing to
+ // the Put/Delete that survived
+ ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
+ ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
+}
+
+// Make sure that merge operands are filtered at the beginning
+TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U)); // next user key
+
+ ASSERT_OK(Run(15, true));
+ ASSERT_TRUE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+  // sequence number is 29 here, because the first two merge operands got
+  // filtered out
+ ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // make sure that we're passing user keys into the filter
+ ASSERT_EQ("a", filter_->last_merge_operand_key());
+}
+
+// Make sure that merge operands are not filtered out if there's a snapshot
+// pointing at them
+TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
+
+ ASSERT_OK(Run(15, true, 32));
+ ASSERT_TRUE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_operator.cc b/src/rocksdb/db/merge_operator.cc
new file mode 100644
index 000000000..75dea432c
--- /dev/null
+++ b/src/rocksdb/db/merge_operator.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // If FullMergeV2 is not implemented, we convert the operand_list to
+ // std::deque<std::string> and pass it to FullMerge
+ std::deque<std::string> operand_list_str;
+ for (auto& op : merge_in.operand_list) {
+ operand_list_str.emplace_back(op.data(), op.size());
+ }
+ return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str,
+ &merge_out->new_value, merge_in.logger);
+}
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const {
+ assert(operand_list.size() >= 2);
+ // Simply loop through the operands
+ Slice temp_slice(operand_list[0]);
+
+ for (size_t i = 1; i < operand_list.size(); ++i) {
+ auto& operand = operand_list[i];
+ std::string temp_value;
+ if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+ return false;
+ }
+ swap(temp_value, *new_value);
+ temp_slice = Slice(*new_value);
+ }
+
+ // The result will be in *new_value. All merges succeeded.
+ return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMergeV2(
+ const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // Simply loop through the operands
+ Slice temp_existing;
+ const Slice* existing_value = merge_in.existing_value;
+ for (const auto& operand : merge_in.operand_list) {
+ std::string temp_value;
+ if (!Merge(merge_in.key, existing_value, operand, &temp_value,
+ merge_in.logger)) {
+ return false;
+ }
+ swap(temp_value, merge_out->new_value);
+ temp_existing = Slice(merge_out->new_value);
+ existing_value = &temp_existing;
+ }
+
+ // The result will be in *new_value. All merges succeeded.
+ return true;
+}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(
+ const Slice& key,
+ const Slice& left_operand,
+ const Slice& right_operand,
+ std::string* new_value,
+ Logger* logger) const {
+ return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace ROCKSDB_NAMESPACE
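
AssociativeMergeOperator above funnels both full and partial merges into a single user-supplied Merge() callback. A minimal hedged sketch of such an operator follows: a fixed64 add in the spirit of the CreateUInt64AddOperator used by the tests below. The class is illustrative, not the library's implementation.

    #include <string>

    #include "rocksdb/merge_operator.h"
    #include "util/coding.h"  // EncodeFixed64 / DecodeFixed64

    namespace ROCKSDB_NAMESPACE {

    // Treats values as little-endian fixed64 counters and adds them.
    class Uint64AddOperatorSketch : public AssociativeMergeOperator {
     public:
      bool Merge(const Slice& /*key*/, const Slice* existing_value,
                 const Slice& value, std::string* new_value,
                 Logger* /*logger*/) const override {
        uint64_t base = 0;
        if (existing_value != nullptr &&
            existing_value->size() == sizeof(uint64_t)) {
          base = DecodeFixed64(existing_value->data());
        }
        uint64_t operand =
            value.size() == sizeof(uint64_t) ? DecodeFixed64(value.data()) : 0;
        char buf[sizeof(uint64_t)];
        EncodeFixed64(buf, base + operand);
        new_value->assign(buf, sizeof(buf));
        return true;  // a failed merge would surface as Status::Corruption
      }
      const char* Name() const override { return "Uint64AddOperatorSketch"; }
    };

    }  // namespace ROCKSDB_NAMESPACE
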
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
new file mode 100644
index 000000000..3f85f6464
--- /dev/null
+++ b/src/rocksdb/db/merge_test.cc
@@ -0,0 +1,504 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <assert.h>
+#include <memory>
+#include <iostream>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool use_compression;
+
+class MergeTest : public testing::Test {};
+
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
+
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+ CountMergeOperator() {
+ mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+ }
+
+ bool Merge(const Slice& key, const Slice* existing_value, const Slice& value,
+ std::string* new_value, Logger* logger) const override {
+ assert(new_value->empty());
+ ++num_merge_operator_calls;
+ if (existing_value == nullptr) {
+ new_value->assign(value.data(), value.size());
+ return true;
+ }
+
+ return mergeOperator_->PartialMerge(
+ key,
+ *existing_value,
+ value,
+ new_value,
+ logger);
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const override {
+ assert(new_value->empty());
+ ++num_partial_merge_calls;
+ return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+ logger);
+ }
+
+ const char* Name() const override { return "UInt64AddOperator"; }
+
+ private:
+ std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+ const size_t max_successive_merges = 0) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = std::make_shared<CountMergeOperator>();
+ options.max_successive_merges = max_successive_merges;
+ Status s;
+ DestroyDB(dbname, Options());
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ if (ttl) {
+ DBWithTTL* db_with_ttl;
+ s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+ db = db_with_ttl;
+ } else {
+ s = DB::Open(options, dbname, &db);
+ }
+#else
+ assert(!ttl);
+ s = DB::Open(options, dbname, &db);
+#endif // !ROCKSDB_LITE
+ if (!s.ok()) {
+ std::cerr << s.ToString() << std::endl;
+ assert(false);
+ }
+ return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name, and we would like
+// to support four high-level operations:
+// set, add, get and remove.
+// This is a quick implementation without a Merge operation.
+class Counters {
+
+ protected:
+ std::shared_ptr<DB> db_;
+
+ WriteOptions put_option_;
+ ReadOptions get_option_;
+ WriteOptions delete_option_;
+
+ uint64_t default_;
+
+ public:
+ explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+ : db_(db),
+ put_option_(),
+ get_option_(),
+ delete_option_(),
+ default_(defaultCount) {
+ assert(db_);
+ }
+
+ virtual ~Counters() {}
+
+ // public interface of Counters.
+ // All four functions return false
+ // if the underlying level db operation failed.
+
+  // mapped to a rocksdb Put
+ bool set(const std::string& key, uint64_t value) {
+ // just treat the internal rep of int64 as the string
+ char buf[sizeof(value)];
+ EncodeFixed64(buf, value);
+ Slice slice(buf, sizeof(value));
+ auto s = db_->Put(put_option_, key, slice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // mapped to a rocksdb Delete
+ bool remove(const std::string& key) {
+ auto s = db_->Delete(delete_option_, key);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // mapped to a rocksdb Get
+ bool get(const std::string& key, uint64_t* value) {
+ std::string str;
+ auto s = db_->Get(get_option_, key, &str);
+
+ if (s.IsNotFound()) {
+ // return default value if not found;
+ *value = default_;
+ return true;
+ } else if (s.ok()) {
+ // deserialization
+ if (str.size() != sizeof(uint64_t)) {
+ std::cerr << "value corruption\n";
+ return false;
+ }
+ *value = DecodeFixed64(&str[0]);
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // 'add' is implemented as get -> modify -> set
+ // An alternative is a single merge operation, see MergeBasedCounters
+ virtual bool add(const std::string& key, uint64_t value) {
+ uint64_t base = default_;
+ return get(key, &base) && set(key, base + value);
+ }
+
+
+ // convenience functions for testing
+ void assert_set(const std::string& key, uint64_t value) {
+ assert(set(key, value));
+ }
+
+ void assert_remove(const std::string& key) { assert(remove(key)); }
+
+ uint64_t assert_get(const std::string& key) {
+ uint64_t value = default_;
+ int result = get(key, &value);
+ assert(result);
+ if (result == 0) exit(1); // Disable unused variable warning.
+ return value;
+ }
+
+ void assert_add(const std::string& key, uint64_t value) {
+ int result = add(key, value);
+ assert(result);
+ if (result == 0) exit(1); // Disable unused variable warning.
+ }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+ WriteOptions merge_option_; // for merge
+
+ public:
+ explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+ : Counters(db, defaultCount),
+ merge_option_() {
+ }
+
+ // mapped to a rocksdb Merge operation
+ bool add(const std::string& key, uint64_t value) override {
+ char encoded[sizeof(uint64_t)];
+ EncodeFixed64(encoded, value);
+ Slice slice(encoded, sizeof(uint64_t));
+ auto s = db_->Merge(merge_option_, key, slice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+};
+
+void dumpDb(DB* db) {
+ auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ //uint64_t value = DecodeFixed64(it->value().data());
+ //std::cout << it->key().ToString() << ": " << value << std::endl;
+ }
+ assert(it->status().ok()); // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+
+ FlushOptions o;
+ o.wait = true;
+
+ counters.assert_set("a", 1);
+
+ if (test_compaction) db->Flush(o);
+
+ assert(counters.assert_get("a") == 1);
+
+ counters.assert_remove("b");
+
+  // default value is 0 if non-existent
+ assert(counters.assert_get("b") == 0);
+
+ counters.assert_add("a", 2);
+
+ if (test_compaction) db->Flush(o);
+
+ // 1+2 = 3
+ assert(counters.assert_get("a")== 3);
+
+ dumpDb(db);
+
+ // 1+...+49 = ?
+ uint64_t sum = 0;
+ for (int i = 1; i < 50; i++) {
+ counters.assert_add("b", i);
+ sum += i;
+ }
+ assert(counters.assert_get("b") == sum);
+
+ dumpDb(db);
+
+ if (test_compaction) {
+ db->Flush(o);
+
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ dumpDb(db);
+
+ assert(counters.assert_get("a")== 3);
+ assert(counters.assert_get("b") == sum);
+ }
+}
+
+void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
+ size_t num_merges) {
+
+ counters.assert_remove("z");
+ uint64_t sum = 0;
+
+ for (size_t i = 1; i <= num_merges; ++i) {
+ resetNumMergeOperatorCalls();
+ counters.assert_add("z", i);
+ sum += i;
+
+ if (i % (max_num_merges + 1) == 0) {
+ assert(num_merge_operator_calls == max_num_merges + 1);
+ } else {
+ assert(num_merge_operator_calls == 0);
+ }
+
+ resetNumMergeOperatorCalls();
+ assert(counters.assert_get("z") == sum);
+ assert(num_merge_operator_calls == i % (max_num_merges + 1));
+ }
+}
+
+void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
+ size_t min_merge, size_t count) {
+ FlushOptions o;
+ o.wait = true;
+
+ // Test case 1: partial merge should be called when the number of merge
+ // operands exceeds the threshold.
+ uint64_t tmp_sum = 0;
+ resetNumPartialMergeCalls();
+ for (size_t i = 1; i <= count; i++) {
+ counters->assert_add("b", i);
+ tmp_sum += i;
+ }
+ db->Flush(o);
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(tmp_sum, counters->assert_get("b"));
+ if (count > max_merge) {
+ // in this case, FullMerge should be called instead.
+ ASSERT_EQ(num_partial_merge_calls, 0U);
+ } else {
+ // if count >= min_merge, then partial merge should be called once.
+ ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
+ }
+
+ // Test case 2: partial merge should not be called when a put is found.
+ resetNumPartialMergeCalls();
+ tmp_sum = 0;
+ db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10");
+ for (size_t i = 1; i <= count; i++) {
+ counters->assert_add("c", i);
+ tmp_sum += i;
+ }
+ db->Flush(o);
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+ ASSERT_EQ(num_partial_merge_calls, 0U);
+}
+
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+ size_t num_merges) {
+ assert(num_merges > max_num_merges);
+
+ Slice key("BatchSuccessiveMerge");
+ uint64_t merge_value = 1;
+ char buf[sizeof(merge_value)];
+ EncodeFixed64(buf, merge_value);
+ Slice merge_value_slice(buf, sizeof(merge_value));
+
+ // Create the batch
+ WriteBatch batch;
+ for (size_t i = 0; i < num_merges; ++i) {
+ batch.Merge(key, merge_value_slice);
+ }
+
+ // Apply to memtable and count the number of merges
+ resetNumMergeOperatorCalls();
+ {
+ Status s = db->Write(WriteOptions(), &batch);
+ assert(s.ok());
+ }
+ ASSERT_EQ(
+ num_merge_operator_calls,
+ static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
+
+ // Get the value
+ resetNumMergeOperatorCalls();
+ std::string get_value_str;
+ {
+ Status s = db->Get(ReadOptions(), key, &get_value_str);
+ assert(s.ok());
+ }
+ assert(get_value_str.size() == sizeof(uint64_t));
+ uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+ ASSERT_EQ(get_value, num_merges * merge_value);
+ ASSERT_EQ(num_merge_operator_calls,
+ static_cast<size_t>((num_merges % (max_num_merges + 1))));
+}
+
+void runTest(const std::string& dbname, const bool use_ttl = false) {
+
+ {
+ auto db = OpenDb(dbname, use_ttl);
+
+ {
+ Counters counters(db, 0);
+ testCounters(counters, db.get(), true);
+ }
+
+ {
+ MergeBasedCounters counters(db, 0);
+ testCounters(counters, db.get(), use_compression);
+ }
+ }
+
+ DestroyDB(dbname, Options());
+
+ {
+ size_t max_merge = 5;
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testCounters(counters, db.get(), use_compression);
+ testSuccessiveMerge(counters, max_merge, max_merge * 2);
+ testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+ DestroyDB(dbname, Options());
+ }
+
+ {
+ size_t max_merge = 100;
+ // Min merge is hard-coded to 2.
+ uint32_t min_merge = 2;
+ for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+ DestroyDB(dbname, Options());
+ }
+ {
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testPartialMerge(&counters, db.get(), max_merge, min_merge,
+ min_merge * 10);
+ DestroyDB(dbname, Options());
+ }
+ }
+
+ {
+ {
+ auto db = OpenDb(dbname);
+ MergeBasedCounters counters(db, 0);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+
+ DB* reopen_db;
+ ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
+ std::string value;
+ ASSERT_TRUE(!(reopen_db->Get(ReadOptions(), "test-key", &value).ok()));
+ delete reopen_db;
+ DestroyDB(dbname, Options());
+ }
+
+ /* Temporary remove this test
+ {
+ std::cout << "Test merge-operator not set after reopen (recovery case)\n";
+ {
+ auto db = OpenDb(dbname);
+ MergeBasedCounters counters(db, 0);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ }
+
+ DB* reopen_db;
+ ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
+ }
+ */
+}
+
+TEST_F(MergeTest, MergeDbTest) {
+ runTest(test::PerThreadDBPath("merge_testdb"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(MergeTest, MergeDbTtlTest) {
+ runTest(test::PerThreadDBPath("merge_testdbttl"),
+ true); // Run test on TTL database
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::use_compression = false;
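+  // Any command-line argument enables compression for the test run.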
+ if (argc > 1) {
+ ROCKSDB_NAMESPACE::use_compression = true;
+ }
+
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/obsolete_files_test.cc b/src/rocksdb/db/obsolete_files_test.cc
new file mode 100644
index 000000000..bf018a0e3
--- /dev/null
+++ b/src/rocksdb/db/obsolete_files_test.cc
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+#include <map>
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::flush;
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObsoleteFilesTest : public DBTestBase {
+ public:
+ ObsoleteFilesTest()
+ : DBTestBase("/obsolete_files_test"), wal_dir_(dbname_ + "/wal_files") {}
+
+ void AddKeys(int numkeys, int startkey) {
+ WriteOptions options;
+ options.sync = false;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+ std::string temp = ToString(i);
+ Slice key(temp);
+ Slice value(temp);
+ ASSERT_OK(db_->Put(options, key, value));
+ }
+ }
+
+ void createLevel0Files(int numFiles, int numKeysPerFile) {
+ int startKey = 0;
+ for (int i = 0; i < numFiles; i++) {
+ AddKeys(numKeysPerFile, startKey);
+ startKey += numKeysPerFile;
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int required_log,
+ int required_sst, int required_manifest) {
+ std::vector<std::string> filenames;
+ env_->GetChildren(dir, &filenames);
+
+ int log_cnt = 0;
+ int sst_cnt = 0;
+ int manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kLogFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(required_log, log_cnt);
+ ASSERT_EQ(required_sst, sst_cnt);
+ ASSERT_EQ(required_manifest, manifest_cnt);
+ }
+
+ void ReopenDB() {
+ Options options = CurrentOptions();
+ // Trigger compaction when the number of level 0 files reaches 2.
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = false;
+ options.delete_obsolete_files_period_micros = 0; // always do full purge
+ options.enable_thread_tracking = true;
+ options.write_buffer_size = 1024 * 1024 * 1000;
+ options.target_file_size_base = 1024 * 1024 * 1000;
+ options.max_bytes_for_level_base = 1024 * 1024 * 1000;
+ options.WAL_ttl_seconds = 300; // Used to test log files
+ options.WAL_size_limit_MB = 1024; // Used to test log files
+ options.wal_dir = wal_dir_;
+ Destroy(options);
+ Reopen(options);
+ }
+
+ const std::string wal_dir_;
+};
+
+TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
+ ReopenDB();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles",
+ "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"},
+ });
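+  // Each dependency above makes the test sync point wait for the listed
+  // DBImpl sync point, so the user thread below runs FindObsoleteFiles() only
+  // after the background compaction has found obsolete files, and
+  // PurgeObsoleteFiles() only after the background purge has finished,
+  // exercising the race between the two cleanup paths.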
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) {
+ Status* p_status = reinterpret_cast<Status*>(arg);
+ ASSERT_OK(*p_status);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
+ std::unordered_set<uint64_t>* files_grabbed_for_purge_ptr =
+ reinterpret_cast<std::unordered_set<uint64_t>*>(arg);
+ ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ createLevel0Files(2, 50000);
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+ port::Thread user_thread([this]() {
+ JobContext jobCxt(0);
+ TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1");
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */,
+ false /* no_full_scan=false */);
+ dbfull()->TEST_UnlockMutex();
+ TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2");
+ dbfull()->PurgeObsoleteFiles(jobCxt);
+ jobCxt.Clean();
+ });
+
+ user_thread.join();
+}
+
+TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) {
+ ReopenDB();
+ SyncPoint::GetInstance()->DisableProcessing();
+ std::vector<uint64_t> optsfiles_nums;
+ std::vector<bool> optsfiles_keep;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", [&](void* arg) {
+ optsfiles_nums.push_back(*reinterpret_cast<uint64_t*>(arg));
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", [&](void* arg) {
+ optsfiles_keep.push_back(*reinterpret_cast<bool*>(arg));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ createLevel0Files(2, 50000);
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ for (int i = 0; i != 4; ++i) {
+ if (i % 2) {
+ ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "false"}}));
+ } else {
+ ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "true"}}));
+ }
+ }
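+  // Each SetOptions() call above persists a new OPTIONS file. With file
+  // deletions disabled they all stay on disk; re-enabling deletions below is
+  // expected to purge the stale ones (the final check asserts only two
+  // remain).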
+ ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */));
+ ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size());
+
+ Close();
+
+ std::vector<std::string> files;
+ int opts_file_count = 0;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t file_num;
+ Slice dummy_info_log_name_prefix;
+ FileType type;
+ WalFileType log_type;
+ if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type,
+ &log_type) &&
+ type == kOptionsFile) {
+ opts_file_count++;
+ }
+ }
+ ASSERT_EQ(2, opts_file_count);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/options_file_test.cc b/src/rocksdb/db/options_file_test.cc
new file mode 100644
index 000000000..00427de8a
--- /dev/null
+++ b/src/rocksdb/db/options_file_test.cc
@@ -0,0 +1,119 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsFileTest : public testing::Test {
+ public:
+ OptionsFileTest() : dbname_(test::PerThreadDBPath("options_file_test")) {}
+
+ std::string dbname_;
+};
+
+namespace {
+void UpdateOptionsFiles(DB* db,
+ std::unordered_set<std::string>* filename_history,
+ int* options_files_count) {
+ std::vector<std::string> filenames;
+ db->GetEnv()->GetChildren(db->GetName(), &filenames);
+ uint64_t number;
+ FileType type;
+ *options_files_count = 0;
+ for (auto filename : filenames) {
+ if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+ filename_history->insert(filename);
+ (*options_files_count)++;
+ }
+ }
+}
+
+// Verify whether the current Options Files are the latest ones.
+void VerifyOptionsFileName(
+ DB* db, const std::unordered_set<std::string>& past_filenames) {
+ std::vector<std::string> filenames;
+ std::unordered_set<std::string> current_filenames;
+ db->GetEnv()->GetChildren(db->GetName(), &filenames);
+ uint64_t number;
+ FileType type;
+ for (auto filename : filenames) {
+ if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+ current_filenames.insert(filename);
+ }
+ }
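+  // Any previously seen options file that is no longer present must be older
+  // than (i.e. sort before) every options file currently on disk.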
+ for (auto past_filename : past_filenames) {
+ if (current_filenames.find(past_filename) != current_filenames.end()) {
+ continue;
+ }
+ for (auto filename : current_filenames) {
+ ASSERT_GT(filename, past_filename);
+ }
+ }
+}
+} // namespace
+
+TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
+ const int kReopenCount = 20;
+ Options opt;
+ opt.create_if_missing = true;
+ DestroyDB(dbname_, opt);
+ std::unordered_set<std::string> filename_history;
+ DB* db;
+ for (int i = 0; i < kReopenCount; ++i) {
+ ASSERT_OK(DB::Open(opt, dbname_, &db));
+ int num_options_files = 0;
+ UpdateOptionsFiles(db, &filename_history, &num_options_files);
+ ASSERT_GT(num_options_files, 0);
+ ASSERT_LE(num_options_files, 2);
+ // Make sure we always keep the latest option files.
+ VerifyOptionsFileName(db, filename_history);
+ delete db;
+ }
+}
+
+TEST_F(OptionsFileTest, OptionsFileName) {
+ const uint64_t kOptionsFileNum = 12345;
+ uint64_t number;
+ FileType type;
+
+ auto options_file_name = OptionsFileName("", kOptionsFileNum);
+ ASSERT_TRUE(ParseFileName(options_file_name, &number, &type, nullptr));
+ ASSERT_EQ(type, kOptionsFile);
+ ASSERT_EQ(number, kOptionsFileNum);
+
+ const uint64_t kTempOptionsFileNum = 54352;
+ auto temp_options_file_name = TempOptionsFileName("", kTempOptionsFileNum);
+ ASSERT_TRUE(ParseFileName(temp_options_file_name, &number, &type, nullptr));
+ ASSERT_NE(temp_options_file_name.find(kTempFileNameSuffix),
+ std::string::npos);
+ ASSERT_EQ(type, kTempFile);
+ ASSERT_EQ(number, kTempOptionsFileNum);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ return 0;
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+}
+#else
+
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ printf("Skipped as Options file is not supported in RocksDBLite.\n");
+ return 0;
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/perf_context_test.cc b/src/rocksdb/db/perf_context_test.cc
new file mode 100644
index 000000000..86f2db7b6
--- /dev/null
+++ b/src/rocksdb/db/perf_context_test.cc
@@ -0,0 +1,981 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <algorithm>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "test_util/testharness.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+bool FLAGS_verbose = false;
+
+// Path to the database on file system
+const std::string kDbName =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("perf_context_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<DB> OpenDb(bool read_only = false) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.max_open_files = -1;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+
+ if (FLAGS_use_set_based_memetable) {
+#ifndef ROCKSDB_LITE
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0));
+ options.memtable_factory.reset(NewHashSkipListRepFactory());
+#endif // ROCKSDB_LITE
+ }
+
+ Status s;
+ if (!read_only) {
+ s = DB::Open(options, kDbName, &db);
+ } else {
+ s = DB::OpenForReadOnly(options, kDbName, &db);
+ }
+ EXPECT_OK(s);
+ return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest : public testing::Test {};
+
+TEST_F(PerfContextTest, SeekIntoDeletion) {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string key = "k" + ToString(i);
+ std::string value = "v" + ToString(i);
+
+ db->Put(write_options, key, value);
+ }
+
+ for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
+ std::string key = "k" + ToString(i);
+ db->Delete(write_options, key);
+ }
+
+ HistogramImpl hist_get;
+ HistogramImpl hist_get_time;
+ for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+ std::string key = "k" + ToString(i);
+ std::string value;
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default());
+ timer.Start();
+ auto status = db->Get(read_options, key, &value);
+ auto elapsed_nanos = timer.ElapsedNanos();
+ ASSERT_TRUE(status.IsNotFound());
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+ hist_get_time.Add(elapsed_nanos);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Get user key comparison: \n" << hist_get.ToString()
+ << "Get time: \n" << hist_get_time.ToString();
+ }
+
+ {
+ HistogramImpl hist_seek_to_first;
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ iter->SeekToFirst();
+ hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count);
+ auto elapsed_nanos = timer.ElapsedNanos();
+
+ if (FLAGS_verbose) {
+ std::cout << "SeekToFirst uesr key comparison: \n"
+ << hist_seek_to_first.ToString()
+ << "ikey skipped: " << get_perf_context()->internal_key_skipped_count
+ << "\n"
+ << "idelete skipped: "
+ << get_perf_context()->internal_delete_skipped_count << "\n"
+ << "elapsed: " << elapsed_nanos << "\n";
+ }
+ }
+
+ HistogramImpl hist_seek;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ std::string key = "k" + ToString(i);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ iter->Seek(key);
+ auto elapsed_nanos = timer.ElapsedNanos();
+ hist_seek.Add(get_perf_context()->user_key_comparison_count);
+ if (FLAGS_verbose) {
+ std::cout << "seek cmp: " << get_perf_context()->user_key_comparison_count
+ << " ikey skipped " << get_perf_context()->internal_key_skipped_count
+ << " idelete skipped "
+ << get_perf_context()->internal_delete_skipped_count
+ << " elapsed: " << elapsed_nanos << "ns\n";
+ }
+
+ get_perf_context()->Reset();
+ ASSERT_TRUE(iter->Valid());
+ StopWatchNano timer2(Env::Default(), true);
+ iter->Next();
+ auto elapsed_nanos2 = timer2.ElapsedNanos();
+ if (FLAGS_verbose) {
+ std::cout << "next cmp: " << get_perf_context()->user_key_comparison_count
+ << "elapsed: " << elapsed_nanos2 << "ns\n";
+ }
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, StopWatchNanoOverhead) {
+ // profile the timer cost by itself!
+ const int kTotalIterations = 1000000;
+ std::vector<uint64_t> timings(kTotalIterations);
+
+ StopWatchNano timer(Env::Default(), true);
+ for (auto& timing : timings) {
+ timing = timer.ElapsedNanos(true /* reset */);
+ }
+
+ HistogramImpl histogram;
+ for (const auto timing : timings) {
+ histogram.Add(timing);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << histogram.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, StopWatchOverhead) {
+ // profile the timer cost by itself!
+ const int kTotalIterations = 1000000;
+ uint64_t elapsed = 0;
+ std::vector<uint64_t> timings(kTotalIterations);
+
+ StopWatch timer(Env::Default(), nullptr, 0, &elapsed);
+ for (auto& timing : timings) {
+ timing = elapsed;
+ }
+
+ HistogramImpl histogram;
+ uint64_t prev_timing = 0;
+ for (const auto timing : timings) {
+ histogram.Add(timing - prev_timing);
+ prev_timing = timing;
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << histogram.ToString();
+ }
+}
+
+void ProfileQueries(bool enabled_time = false) {
+ DestroyDB(kDbName, Options()); // Start this test with a fresh DB
+
+ auto db = OpenDb();
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ HistogramImpl hist_put;
+
+ HistogramImpl hist_get;
+ HistogramImpl hist_get_snapshot;
+ HistogramImpl hist_get_memtable;
+ HistogramImpl hist_get_files;
+ HistogramImpl hist_get_post_process;
+ HistogramImpl hist_num_memtable_checked;
+
+ HistogramImpl hist_mget;
+ HistogramImpl hist_mget_snapshot;
+ HistogramImpl hist_mget_memtable;
+ HistogramImpl hist_mget_files;
+ HistogramImpl hist_mget_post_process;
+ HistogramImpl hist_mget_num_memtable_checked;
+
+ HistogramImpl hist_write_pre_post;
+ HistogramImpl hist_write_wal_time;
+ HistogramImpl hist_write_memtable_time;
+ HistogramImpl hist_write_delay_time;
+ HistogramImpl hist_write_thread_wait_nanos;
+ HistogramImpl hist_write_scheduling_time;
+
+ uint64_t total_db_mutex_nanos = 0;
+
+ if (FLAGS_verbose) {
+ std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+ }
+
+ std::vector<int> keys;
+ const int kFlushFlag = -1;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ keys.push_back(i);
+ if (i == FLAGS_total_keys / 2) {
+ // Issuing a flush in the middle.
+ keys.push_back(kFlushFlag);
+ }
+ }
+
+ if (FLAGS_random_key) {
+ std::random_shuffle(keys.begin(), keys.end());
+ }
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+ int num_mutex_waited = 0;
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ FlushOptions fo;
+ db->Flush(fo);
+ continue;
+ }
+
+ std::string key = "k" + ToString(i);
+ std::string value = "v" + ToString(i);
+
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ db->Put(write_options, key, value);
+ if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+ }
+ hist_write_pre_post.Add(
+ get_perf_context()->write_pre_and_post_process_time);
+ hist_write_wal_time.Add(get_perf_context()->write_wal_time);
+ hist_write_memtable_time.Add(get_perf_context()->write_memtable_time);
+ hist_write_delay_time.Add(get_perf_context()->write_delay_time);
+ hist_write_thread_wait_nanos.Add(
+ get_perf_context()->write_thread_wait_nanos);
+ hist_write_scheduling_time.Add(
+ get_perf_context()->write_scheduling_flushes_compactions_time);
+ hist_put.Add(get_perf_context()->user_key_comparison_count);
+ total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos;
+ }
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ continue;
+ }
+ std::string key = "k" + ToString(i);
+ std::string expected_value = "v" + ToString(i);
+ std::string value;
+
+ std::vector<Slice> multiget_keys = {Slice(key)};
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(read_options, key, &value));
+ ASSERT_EQ(expected_value, value);
+ hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+ get_perf_context()->Reset();
+ db->MultiGet(read_options, multiget_keys, &values);
+ hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_mget_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_mget.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
+ << "Get uesr key comparison: \n" << hist_get.ToString()
+ << "MultiGet uesr key comparison: \n" << hist_get.ToString();
+ std::cout << "Put(): Pre and Post Process Time: \n"
+ << hist_write_pre_post.ToString() << " Writing WAL time: \n"
+ << hist_write_wal_time.ToString() << "\n"
+ << " Writing Mem Table time: \n"
+ << hist_write_memtable_time.ToString() << "\n"
+ << " Write Delay: \n" << hist_write_delay_time.ToString() << "\n"
+ << " Waiting for Batch time: \n"
+ << hist_write_thread_wait_nanos.ToString() << "\n"
+ << " Scheduling Flushes and Compactions Time: \n"
+ << hist_write_scheduling_time.ToString() << "\n"
+ << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n";
+
+ std::cout << "Get(): Time to get snapshot: \n"
+ << hist_get_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_get_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_get_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n" << hist_get_post_process.ToString()
+ << "\n";
+
+ std::cout << "MultiGet(): Time to get snapshot: \n"
+ << hist_mget_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_mget_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_mget_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_mget_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_mget_post_process.ToString() << "\n";
+ }
+
+ if (enabled_time) {
+ ASSERT_GT(hist_get.Average(), 0);
+ ASSERT_GT(hist_get_snapshot.Average(), 0);
+ ASSERT_GT(hist_get_memtable.Average(), 0);
+ ASSERT_GT(hist_get_files.Average(), 0);
+ ASSERT_GT(hist_get_post_process.Average(), 0);
+ ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+ ASSERT_GT(hist_mget.Average(), 0);
+ ASSERT_GT(hist_mget_snapshot.Average(), 0);
+ ASSERT_GT(hist_mget_memtable.Average(), 0);
+ ASSERT_GT(hist_mget_files.Average(), 0);
+ ASSERT_GT(hist_mget_post_process.Average(), 0);
+ ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+
+ EXPECT_GT(hist_write_pre_post.Average(), 0);
+ EXPECT_GT(hist_write_wal_time.Average(), 0);
+ EXPECT_GT(hist_write_memtable_time.Average(), 0);
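+    // With single-threaded writes into a large memtable there should be no
+    // write stalls and no waiting behind another write-group leader, so these
+    // two timers are expected to stay at zero.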
+ EXPECT_EQ(hist_write_delay_time.Average(), 0);
+ EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0);
+ EXPECT_GT(hist_write_scheduling_time.Average(), 0);
+
+#ifndef NDEBUG
+ ASSERT_GT(total_db_mutex_nanos, 2000U);
+#endif
+ }
+
+ db.reset();
+ db = OpenDb(true);
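+  // Reopen read-only and repeat the same Get()/MultiGet() measurements; in
+  // this mode Get() needs no super-version post-processing, which the
+  // assertions at the end rely on.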
+
+ hist_get.Clear();
+ hist_get_snapshot.Clear();
+ hist_get_memtable.Clear();
+ hist_get_files.Clear();
+ hist_get_post_process.Clear();
+ hist_num_memtable_checked.Clear();
+
+ hist_mget.Clear();
+ hist_mget_snapshot.Clear();
+ hist_mget_memtable.Clear();
+ hist_mget_files.Clear();
+ hist_mget_post_process.Clear();
+ hist_mget_num_memtable_checked.Clear();
+
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ continue;
+ }
+ std::string key = "k" + ToString(i);
+ std::string expected_value = "v" + ToString(i);
+ std::string value;
+
+ std::vector<Slice> multiget_keys = {Slice(key)};
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(read_options, key, &value));
+ ASSERT_EQ(expected_value, value);
+ hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+ get_perf_context()->Reset();
+ db->MultiGet(read_options, multiget_keys, &values);
+ hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_mget_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_mget.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString()
+ << "ReadOnly MultiGet uesr key comparison: \n"
+ << hist_mget.ToString();
+
+ std::cout << "ReadOnly Get(): Time to get snapshot: \n"
+ << hist_get_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_get_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_get_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n" << hist_get_post_process.ToString()
+ << "\n";
+
+ std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
+ << hist_mget_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_mget_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_mget_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_mget_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_mget_post_process.ToString() << "\n";
+ }
+
+ if (enabled_time) {
+ ASSERT_GT(hist_get.Average(), 0);
+ ASSERT_GT(hist_get_memtable.Average(), 0);
+ ASSERT_GT(hist_get_files.Average(), 0);
+ ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+ // In read-only mode Get(), no super version operation is needed
+ ASSERT_EQ(hist_get_post_process.Average(), 0);
+ ASSERT_GT(hist_get_snapshot.Average(), 0);
+
+ ASSERT_GT(hist_mget.Average(), 0);
+ ASSERT_GT(hist_mget_snapshot.Average(), 0);
+ ASSERT_GT(hist_mget_memtable.Average(), 0);
+ ASSERT_GT(hist_mget_files.Average(), 0);
+ ASSERT_GT(hist_mget_post_process.Average(), 0);
+ ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(PerfContextTest, KeyComparisonCount) {
+ SetPerfLevel(kEnableCount);
+ ProfileQueries();
+
+ SetPerfLevel(kDisable);
+ ProfileQueries();
+
+ SetPerfLevel(kEnableTime);
+ ProfileQueries(true);
+}
+#endif // ROCKSDB_LITE
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, the worst-case Seek key
+// comparison count is close to the total number of keys (linear) when there
+// is only one memtable. With two memtables, even the average Seek key
+// comparison count becomes linear in the input size.
+
+TEST_F(PerfContextTest, SeekKeyComparison) {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ if (FLAGS_verbose) {
+ std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+ }
+
+ std::vector<int> keys;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ keys.push_back(i);
+ }
+
+ if (FLAGS_random_key) {
+ std::random_shuffle(keys.begin(), keys.end());
+ }
+
+ HistogramImpl hist_put_time;
+ HistogramImpl hist_wal_time;
+ HistogramImpl hist_time_diff;
+
+ SetPerfLevel(kEnableTime);
+ StopWatchNano timer(Env::Default());
+ for (const int i : keys) {
+ std::string key = "k" + ToString(i);
+ std::string value = "v" + ToString(i);
+
+ get_perf_context()->Reset();
+ timer.Start();
+ db->Put(write_options, key, value);
+ auto put_time = timer.ElapsedNanos();
+ hist_put_time.Add(put_time);
+ hist_wal_time.Add(get_perf_context()->write_wal_time);
+ hist_time_diff.Add(put_time - get_perf_context()->write_wal_time);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Put time:\n" << hist_put_time.ToString() << "WAL time:\n"
+ << hist_wal_time.ToString() << "time diff:\n"
+ << hist_time_diff.ToString();
+ }
+
+ HistogramImpl hist_seek;
+ HistogramImpl hist_next;
+
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string key = "k" + ToString(i);
+ std::string value = "v" + ToString(i);
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ get_perf_context()->Reset();
+ iter->Seek(key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->value().ToString(), value);
+ hist_seek.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ for (iter->SeekToFirst(); iter->Valid();) {
+ get_perf_context()->Reset();
+ iter->Next();
+ hist_next.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n"
+ << hist_next.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, DBMutexLockCounter) {
+ int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+ for (PerfLevel perf_level_test :
+ {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) {
+ for (int c = 0; c < 2; ++c) {
+ InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]);
+ mutex.Lock();
+ ROCKSDB_NAMESPACE::port::Thread child_thread([&] {
+ SetPerfLevel(perf_level_test);
+ get_perf_context()->Reset();
+ ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+ mutex.Lock();
+ mutex.Unlock();
+ if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex ||
+ stats_code[c] != DB_MUTEX_WAIT_MICROS) {
+ ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+ } else {
+ // increment the counter only when it's a DB Mutex
+ ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0);
+ }
+ });
+ Env::Default()->SleepForMicroseconds(100);
+ mutex.Unlock();
+ child_thread.join();
+ }
+ }
+}
+
+TEST_F(PerfContextTest, FalseDBMutexWait) {
+ SetPerfLevel(kEnableTime);
+ int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+ for (int c = 0; c < 2; ++c) {
+ InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]);
+ InstrumentedCondVar lock(&mutex);
+ get_perf_context()->Reset();
+ mutex.Lock();
+ lock.TimedWait(100);
+ mutex.Unlock();
+ if (stats_code[c] == static_cast<int>(DB_MUTEX_WAIT_MICROS)) {
+ // increment the counter only when it's a DB Mutex
+ ASSERT_GT(get_perf_context()->db_condition_wait_nanos, 0);
+ } else {
+ ASSERT_EQ(get_perf_context()->db_condition_wait_nanos, 0);
+ }
+ }
+}
+
+TEST_F(PerfContextTest, ToString) {
+ get_perf_context()->Reset();
+ get_perf_context()->block_read_count = 12345;
+
+ std::string zero_included = get_perf_context()->ToString();
+ ASSERT_NE(std::string::npos, zero_included.find("= 0"));
+ ASSERT_NE(std::string::npos, zero_included.find("= 12345"));
+
+ std::string zero_excluded = get_perf_context()->ToString(true);
+ ASSERT_EQ(std::string::npos, zero_excluded.find("= 0"));
+ ASSERT_NE(std::string::npos, zero_excluded.find("= 12345"));
+}
+
+TEST_F(PerfContextTest, MergeOperatorTime) {
+ DestroyDB(kDbName, Options());
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Status s = DB::Open(options, kDbName, &db);
+ EXPECT_OK(s);
+
+ std::string val;
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val2"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val3"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val4"));
+
+ SetPerfLevel(kEnableTime);
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ delete db;
+}
+
+TEST_F(PerfContextTest, CopyAndMove) {
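+  // Per-level perf context data should be deep-copied (or moved) into the
+  // target; clearing and resetting the source afterwards must not change the
+  // values observed through the copy.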
+ // Assignment operator
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_assign;
+ perf_context_assign = *get_perf_context();
+ ASSERT_EQ(
+ 1,
+ (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1,
+ (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_assign.ClearPerLevelPerfContext();
+ perf_context_assign.Reset();
+ }
+ // Copy constructor
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_copy(*get_perf_context());
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_copy.ClearPerLevelPerfContext();
+ perf_context_copy.Reset();
+ }
+ // Move constructor
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_move = std::move(*get_perf_context());
+ ASSERT_EQ(
+ 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_move.ClearPerLevelPerfContext();
+ perf_context_move.Reset();
+ }
+}
+
+TEST_F(PerfContextTest, PerfContextDisableEnable) {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+ get_perf_context()->DisablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+ get_perf_context()->DisablePerLevelPerfContext();
+ PerfContext perf_context_copy(*get_perf_context());
+ ASSERT_EQ(1, (*(perf_context_copy.level_to_perf_context))[0]
+ .bloom_filter_full_positive);
+  // This was set while the per-level perf context was disabled, so it should
+  // not have been copied.
+ ASSERT_NE(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[0].block_cache_hit_count);
+ perf_context_copy.ClearPerLevelPerfContext();
+ perf_context_copy.Reset();
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+}
+
+TEST_F(PerfContextTest, PerfContextByLevelGetSet) {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, 2);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 5, 2);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 2, 3);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 4, 1);
+ ASSERT_EQ(
+ 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ(
+ 1, (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ ASSERT_EQ(
+ 2, (*(get_perf_context()->level_to_perf_context))[7].bloom_filter_useful);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0]
+ .bloom_filter_full_positive);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2]
+ .bloom_filter_full_true_positive);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0]
+ .block_cache_hit_count);
+ ASSERT_EQ(5, (*(get_perf_context()->level_to_perf_context))[2]
+ .block_cache_hit_count);
+ ASSERT_EQ(2, (*(get_perf_context()->level_to_perf_context))[3]
+ .block_cache_miss_count);
+ ASSERT_EQ(4, (*(get_perf_context()->level_to_perf_context))[1]
+ .block_cache_miss_count);
+ std::string zero_excluded = get_perf_context()->ToString(true);
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_full_positive = 1@level0"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_full_true_positive = 1@level2"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("block_cache_hit_count = 1@level0, 5@level2"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("block_cache_miss_count = 4@level1, 2@level3"));
+}
+
+TEST_F(PerfContextTest, CPUTimer) {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+
+ std::string max_str = "0";
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string i_str = ToString(i);
+ std::string key = "k" + i_str;
+ std::string value = "v" + i_str;
+ max_str = max_str > i_str ? max_str : i_str;
+
+ db->Put(write_options, key, value);
+ }
+ std::string last_key = "k" + max_str;
+ std::string last_value = "v" + max_str;
+
+ {
+ // Get
+ get_perf_context()->Reset();
+ std::string value;
+ ASSERT_OK(db->Get(read_options, "k0", &value));
+ ASSERT_EQ(value, "v0");
+
+ if (FLAGS_verbose) {
+ std::cout << "Get CPU time nanos: " << get_perf_context()->get_cpu_nanos
+ << "ns\n";
+ }
+
+ // Iter
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ // Seek
+ get_perf_context()->Reset();
+ iter->Seek(last_key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Seek CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekForPrev
+ get_perf_context()->Reset();
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekForPrev CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekToLast
+ get_perf_context()->Reset();
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekToLast CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekToFirst
+ get_perf_context()->Reset();
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekToFirst CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // Next
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Next CPU time nanos: "
+ << get_perf_context()->iter_next_cpu_nanos << "ns\n";
+ }
+
+ // Prev
+ get_perf_context()->Reset();
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Prev CPU time nanos: "
+ << get_perf_context()->iter_prev_cpu_nanos << "ns\n";
+ }
+
+ // monotonically increasing
+ get_perf_context()->Reset();
+ auto count = get_perf_context()->iter_seek_cpu_nanos;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ iter->Seek("k" + ToString(i));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v" + ToString(i), iter->value().ToString());
+ auto next_count = get_perf_context()->iter_seek_cpu_nanos;
+ ASSERT_GT(next_count, count);
+ count = next_count;
+ }
+
+ // iterator creation/destruction; multiple iterators
+ {
+ std::unique_ptr<Iterator> iter2(db->NewIterator(read_options));
+ ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+ iter2->Seek(last_key);
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ(last_value, iter2->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, count);
+ count = get_perf_context()->iter_seek_cpu_nanos;
+ }
+ ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+
+ for (int i = 1; i < argc; i++) {
+ int n;
+ char junk;
+
+ if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+ FLAGS_write_buffer_size = n;
+ }
+
+ if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+ FLAGS_total_keys = n;
+ }
+
+ if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_random_key = n;
+ }
+
+ if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_use_set_based_memetable = n;
+ }
+
+ if (sscanf(argv[i], "--verbose=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_verbose = n;
+ }
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << kDbName << "\n";
+ }
+
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/pinned_iterators_manager.h b/src/rocksdb/db/pinned_iterators_manager.h
new file mode 100644
index 000000000..5e8ad51dd
--- /dev/null
+++ b/src/rocksdb/db/pinned_iterators_manager.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
+// and it will be responsible for deleting pinned Iterators when they are
+// not needed anymore.
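+//
+// Typical usage (sketch, based on the interface below):
+//   PinnedIteratorsManager pinned_iters_mgr;
+//   pinned_iters_mgr.StartPinning();
+//   pinned_iters_mgr.PinIterator(iter);   // takes ownership of iter
+//   ...                                   // iter stays valid while pinned
+//   pinned_iters_mgr.ReleasePinnedData(); // deletes every pinned iterator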
+class PinnedIteratorsManager : public Cleanable {
+ public:
+ PinnedIteratorsManager() : pinning_enabled(false) {}
+ ~PinnedIteratorsManager() {
+ if (pinning_enabled) {
+ ReleasePinnedData();
+ }
+ }
+
+ // Enable Iterators pinning
+ void StartPinning() {
+ assert(pinning_enabled == false);
+ pinning_enabled = true;
+ }
+
+  // Is pinning enabled?
+ bool PinningEnabled() { return pinning_enabled; }
+
+ // Take ownership of iter and delete it when ReleasePinnedData() is called
+ void PinIterator(InternalIterator* iter, bool arena = false) {
+ if (arena) {
+ PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator);
+ } else {
+ PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator);
+ }
+ }
+
+ typedef void (*ReleaseFunction)(void* arg1);
+ void PinPtr(void* ptr, ReleaseFunction release_func) {
+ assert(pinning_enabled);
+ if (ptr == nullptr) {
+ return;
+ }
+ pinned_ptrs_.emplace_back(ptr, release_func);
+ }
+
+ // Release pinned Iterators
+ inline void ReleasePinnedData() {
+ assert(pinning_enabled == true);
+ pinning_enabled = false;
+
+ // Remove duplicate pointers
+ std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end());
+ auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end());
+
+ for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) {
+ void* ptr = i->first;
+ ReleaseFunction release_func = i->second;
+ release_func(ptr);
+ }
+ pinned_ptrs_.clear();
+ // Also do cleanups from the base Cleanable
+ Cleanable::Reset();
+ }
+
+ private:
+ static void ReleaseInternalIterator(void* ptr) {
+ delete reinterpret_cast<InternalIterator*>(ptr);
+ }
+
+ static void ReleaseArenaInternalIterator(void* ptr) {
+ reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator();
+ }
+
+ bool pinning_enabled;
+ std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc
new file mode 100644
index 000000000..d80cc4f67
--- /dev/null
+++ b/src/rocksdb/db/plain_table_db_test.cc
@@ -0,0 +1,1375 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/plain/plain_table_reader.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace ROCKSDB_NAMESPACE {
+class PlainTableKeyDecoderTest : public testing::Test {};
+
+TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) {
+ std::string tmp;
+ Random rnd(301);
+ const uint32_t kLength = 2222;
+ Slice contents = test::RandomString(&rnd, kLength, &tmp);
+ test::StringSource* string_source =
+ new test::StringSource(contents, 0, false);
+
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ test::GetRandomAccessFileReader(string_source));
+ std::unique_ptr<PlainTableReaderFileInfo> file_info(
+ new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(),
+ kLength));
+
+ {
+ PlainTableFileReader reader(file_info.get());
+
+ const uint32_t kReadSize = 77;
+ for (uint32_t pos = 0; pos < kLength; pos += kReadSize) {
+ uint32_t read_size = std::min(kLength - pos, kReadSize);
+ Slice out;
+ ASSERT_TRUE(reader.Read(pos, read_size, &out));
+ ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size)));
+ }
+
+ ASSERT_LT(uint32_t(string_source->total_reads()), kLength / kReadSize / 2);
+ }
+
+ std::vector<std::vector<std::pair<uint32_t, uint32_t>>> reads = {
+ {{600, 30}, {590, 30}, {600, 20}, {600, 40}},
+ {{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}},
+ {{1000, 20}, {500, 20}, {1000, 50}},
+ {{1000, 20}, {500, 20}, {500, 20}},
+ {{1000, 20}, {500, 20}, {200, 20}, {500, 20}},
+ {{1000, 20}, {500, 20}, {200, 20}, {1000, 50}},
+ {{600, 500}, {610, 20}, {100, 20}},
+ {{500, 100}, {490, 100}, {550, 50}},
+ };
+
+ std::vector<int> num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2};
+
+ for (size_t i = 0; i < reads.size(); i++) {
+ string_source->set_total_reads(0);
+ PlainTableFileReader reader(file_info.get());
+ for (auto p : reads[i]) {
+ Slice out;
+ ASSERT_TRUE(reader.Read(p.first, p.second, &out));
+ ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second)));
+ }
+ ASSERT_EQ(num_file_reads[i], string_source->total_reads());
+ }
+}
+
+class PlainTableDBTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ protected:
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ bool mmap_mode_;
+ Options last_options_;
+
+ public:
+ PlainTableDBTest() : env_(Env::Default()) {}
+
+ ~PlainTableDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ void SetUp() override {
+ mmap_mode_ = GetParam();
+ dbname_ = test::PerThreadDBPath("plain_table_db_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ db_ = nullptr;
+ Reopen();
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ Options options;
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = 2;
+ plain_table_options.hash_table_ratio = 0.8;
+ plain_table_options.index_sparseness = 3;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPrefix;
+ plain_table_options.full_scan_mode = false;
+ plain_table_options.store_index_in_file = false;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.allow_mmap_reads = mmap_mode_;
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ return options;
+ }
+
+ DBImpl* dbfull() {
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ void Reopen(Options* options = nullptr) {
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Close() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ bool mmap_mode() const { return mmap_mode_; }
+
+ void DestroyAndReopen(Options* options = nullptr) {
+    // Destroy using the last options.
+ Destroy(&last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, *options));
+ }
+
+ Status PureReopen(Options* options, DB** db) {
+ return DB::Open(*options, dbname_, db);
+ }
+
+ Status ReopenForReadOnly(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ return DB::OpenForReadOnly(*options, dbname_, &db_);
+ }
+
+ Status TryReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ last_options_ = opts;
+
+ return DB::Open(opts, dbname_, &db_);
+ }
+
+ Status Put(const Slice& k, const Slice& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
+ std::string FilesPerLevel() {
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+ }
+};
+
+TEST_P(PlainTableDBTest, Empty) {
+ ASSERT_TRUE(dbfull() != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+
+class TestPlainTableReader : public PlainTableReader {
+ public:
+ TestPlainTableReader(const EnvOptions& env_options,
+ const InternalKeyComparator& icomparator,
+ EncodingType encoding_type, uint64_t file_size,
+ int bloom_bits_per_key, double hash_table_ratio,
+ size_t index_sparseness,
+ const TableProperties* table_properties,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ const ImmutableCFOptions& ioptions,
+ const SliceTransform* prefix_extractor,
+ bool* expect_bloom_not_match, bool store_index_in_file,
+ uint32_t column_family_id,
+ const std::string& column_family_name)
+ : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
+ encoding_type, file_size, table_properties,
+ prefix_extractor),
+ expect_bloom_not_match_(expect_bloom_not_match) {
+ Status s = MmapDataIfNeeded();
+ EXPECT_TRUE(s.ok());
+
+ s = PopulateIndex(const_cast<TableProperties*>(table_properties),
+ bloom_bits_per_key, hash_table_ratio, index_sparseness,
+ 2 * 1024 * 1024);
+ EXPECT_TRUE(s.ok());
+
+ TableProperties* props = const_cast<TableProperties*>(table_properties);
+ EXPECT_EQ(column_family_id, static_cast<uint32_t>(props->column_family_id));
+ EXPECT_EQ(column_family_name, props->column_family_name);
+ if (store_index_in_file) {
+ auto bloom_version_ptr = props->user_collected_properties.find(
+ PlainTablePropertyNames::kBloomVersion);
+ EXPECT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
+ EXPECT_EQ(bloom_version_ptr->second, std::string("1"));
+ if (ioptions.bloom_locality > 0) {
+ auto num_blocks_ptr = props->user_collected_properties.find(
+ PlainTablePropertyNames::kNumBloomBlocks);
+ EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
+ }
+ }
+ table_properties_.reset(props);
+ }
+
+ ~TestPlainTableReader() override {}
+
+ private:
+ bool MatchBloom(uint32_t hash) const override {
+ bool ret = PlainTableReader::MatchBloom(hash);
+ if (*expect_bloom_not_match_) {
+ EXPECT_TRUE(!ret);
+ } else {
+ EXPECT_TRUE(ret);
+ }
+ return ret;
+ }
+ bool* expect_bloom_not_match_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+class TestPlainTableFactory : public PlainTableFactory {
+ public:
+ explicit TestPlainTableFactory(bool* expect_bloom_not_match,
+ const PlainTableOptions& options,
+ uint32_t column_family_id,
+ std::string column_family_name)
+ : PlainTableFactory(options),
+ bloom_bits_per_key_(options.bloom_bits_per_key),
+ hash_table_ratio_(options.hash_table_ratio),
+ index_sparseness_(options.index_sparseness),
+ store_index_in_file_(options.store_index_in_file),
+ expect_bloom_not_match_(expect_bloom_not_match),
+ column_family_id_(column_family_id),
+ column_family_name_(std::move(column_family_name)) {}
+
+ Status NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const override {
+ TableProperties* props = nullptr;
+ auto s =
+ ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions, &props,
+ true /* compression_type_missing */);
+ EXPECT_TRUE(s.ok());
+
+ if (store_index_in_file_) {
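+      // When the index was stored in the SST file, both the bloom meta block
+      // and the plain table index meta block must be locatable.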
+ BlockHandle bloom_block_handle;
+ s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions,
+ BloomBlockBuilder::kBloomBlock, &bloom_block_handle,
+ /* compression_type_missing */ true);
+ EXPECT_TRUE(s.ok());
+
+ BlockHandle index_block_handle;
+ s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions,
+ PlainTableIndexBuilder::kPlainTableIndexBlock,
+ &index_block_handle, /* compression_type_missing */ true);
+ EXPECT_TRUE(s.ok());
+ }
+
+ auto& user_props = props->user_collected_properties;
+ auto encoding_type_prop =
+ user_props.find(PlainTablePropertyNames::kEncodingType);
+ assert(encoding_type_prop != user_props.end());
+ EncodingType encoding_type = static_cast<EncodingType>(
+ DecodeFixed32(encoding_type_prop->second.c_str()));
+
+ std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+ table_reader_options.env_options,
+ table_reader_options.internal_comparator, encoding_type, file_size,
+ bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
+ std::move(file), table_reader_options.ioptions,
+ table_reader_options.prefix_extractor, expect_bloom_not_match_,
+ store_index_in_file_, column_family_id_, column_family_name_));
+
+ *table = std::move(new_reader);
+ return s;
+ }
+
+ private:
+ int bloom_bits_per_key_;
+ double hash_table_ratio_;
+ size_t index_sparseness_;
+ bool store_index_in_file_;
+ bool* expect_bloom_not_match_;
+ const uint32_t column_family_id_;
+ const std::string column_family_name_;
+};
+
+TEST_P(PlainTableDBTest, BadOptions1) {
+ // Build with a prefix extractor
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ dbfull()->TEST_FlushMemTable();
+
+ // Bad attempt to re-open without a prefix extractor
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset();
+ ASSERT_EQ(
+ "Invalid argument: Prefix extractor is missing when opening a PlainTable "
+ "built using a prefix extractor",
+ TryReopen(&options).ToString());
+
+ // Bad attempt to re-open with different prefix extractor
+ options.prefix_extractor.reset(NewFixedPrefixTransform(6));
+ ASSERT_EQ(
+ "Invalid argument: Prefix extractor given doesn't match the one used to "
+ "build PlainTable",
+ TryReopen(&options).ToString());
+
+ // Correct prefix extractor
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, BadOptions2) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset();
+ options.create_if_missing = true;
+ DestroyAndReopen(&options);
+ // Build without a prefix extractor
+ // (apparently works even if hash_table_ratio > 0)
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ dbfull()->TEST_FlushMemTable();
+
+ // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor
+ Status s = TryReopen(&options);
+ ASSERT_EQ(
+ "Not implemented: PlainTable requires a prefix extractor enable prefix "
+ "hash mode.",
+ s.ToString());
+
+ // OK to open with hash_table_ratio == 0 and no prefix extractor
+ PlainTableOptions plain_table_options;
+ plain_table_options.hash_table_ratio = 0;
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+
+ // OK to open newly with a prefix_extractor and hash table; builds index
+ // in memory.
+ options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, Flush) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom = -1; bloom <= 117; bloom += 117) {
+ const int bloom_bits = std::max(bloom, 0);
+ const bool full_scan_mode = bloom < 0;
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ for (int store_index_in_file = 0; store_index_in_file <= 1;
+ ++store_index_in_file) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ if (total_order) {
+ options.prefix_extractor.reset();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.full_scan_mode = full_scan_mode;
+ plain_table_options.store_index_in_file = store_index_in_file;
+
+ options.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ } else {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.full_scan_mode = full_scan_mode;
+ plain_table_options.store_index_in_file = store_index_in_file;
+
+ options.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ }
+ DestroyAndReopen(&options);
+ uint64_t int_num;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(Put("0000000000000bar", "v2"));
+ ASSERT_OK(Put("1000000000000foo", "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ TablePropertiesCollection ptc;
+            static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+ ASSERT_EQ(1U, ptc.size());
+ auto row = ptc.begin();
+ auto tp = row->second;
+
+ if (full_scan_mode) {
+ // Does not support Get/Seek
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("0000000000000bar", iter->key().ToString());
+ ASSERT_EQ("v2", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000000foo", iter->key().ToString());
+ ASSERT_EQ("v3", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_TRUE(iter->status().ok());
+ } else {
+ if (!store_index_in_file) {
+ ASSERT_EQ(total_order ? "4" : "12",
+ (tp->user_collected_properties)
+ .at("plain_table_hash_table_size"));
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_sub_index_size"));
+ } else {
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_hash_table_size"));
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_sub_index_size"));
+ }
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, Flush2) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ for (int store_index_in_file = 0; store_index_in_file <= 1;
+ ++store_index_in_file) {
+ if (encoding_type == kPrefix && total_order) {
+ continue;
+ }
+ if (!bloom_bits && store_index_in_file) {
+ continue;
+ }
+ if (total_order && store_index_in_file) {
+ continue;
+ }
+ bool expect_bloom_not_match = false;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ if (total_order) {
+ options.prefix_extractor = nullptr;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ } else {
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ }
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.store_index_in_file = store_index_in_file;
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("0000000000000bar", "b"));
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_OK(Put("1000000000000foo", "v2"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v2", Get("1000000000000foo"));
+
+ ASSERT_OK(Put("0000000000000eee", "v3"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v3", Get("0000000000000eee"));
+
+ ASSERT_OK(Delete("0000000000000bar"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+ ASSERT_OK(Put("0000000000000eee", "v5"));
+ ASSERT_OK(Put("9000000000000eee", "v5"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v5", Get("0000000000000eee"));
+
+ // Test Bloom Filter
+ if (bloom_bits > 0) {
+              // Neither the key nor its prefix exists, so the Bloom filter
+              // should not match.
+ expect_bloom_not_match = true;
+ ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+ // Key doesn't exist any more but prefix exists.
+ if (total_order) {
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+ }
+ expect_bloom_not_match = false;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, Immortal) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.max_open_files = -1;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = 10;
+ plain_table_options.encoding_type = encoding_type;
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("0000000000000bar", "b"));
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ dbfull()->TEST_FlushMemTable();
+
+ int copied = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ("b", Get("0000000000000bar"));
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+ ASSERT_EQ(2, copied);
+ copied = 0;
+
+ Close();
+ ASSERT_OK(ReopenForReadOnly(&options));
+
+ ASSERT_EQ("b", Get("0000000000000bar"));
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
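+    // In read-only mmap mode the values can be returned as references into
+    // the mapped (immortal) file data, so no PinSelf copies are expected;
+    // otherwise each Get copies the value.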
+ if (mmap_mode()) {
+ ASSERT_EQ(0, copied);
+ } else {
+ ASSERT_EQ(2, copied);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(PlainTableDBTest, Iterator) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ if (encoding_type == kPrefix && total_order == 1) {
+ continue;
+ }
+ bool expect_bloom_not_match = false;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ if (total_order) {
+ options.prefix_extractor = nullptr;
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+ } else {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+ }
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000foo002", "v_2"));
+ ASSERT_OK(Put("0000000000000bar", "random"));
+ ASSERT_OK(Put("1000000000foo001", "v1"));
+ ASSERT_OK(Put("3000000000000bar", "bar_v"));
+ ASSERT_OK(Put("1000000000foo003", "v__3"));
+ ASSERT_OK(Put("1000000000foo004", "v__4"));
+ ASSERT_OK(Put("1000000000foo005", "v__5"));
+ ASSERT_OK(Put("1000000000foo007", "v__7"));
+ ASSERT_OK(Put("1000000000foo008", "v__8"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v1", Get("1000000000foo001"));
+ ASSERT_EQ("v__3", Get("1000000000foo003"));
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo001", iter->key().ToString());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo002", iter->key().ToString());
+ ASSERT_EQ("v_2", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo003", iter->key().ToString());
+ ASSERT_EQ("v__3", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo004", iter->key().ToString());
+ ASSERT_EQ("v__4", iter->value().ToString());
+
+ iter->Seek("3000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ ASSERT_EQ("bar_v", iter->value().ToString());
+
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo001", iter->key().ToString());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ iter->Seek("1000000000foo005");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo006");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo007", iter->key().ToString());
+ ASSERT_EQ("v__7", iter->value().ToString());
+
+ iter->Seek("1000000000foo008");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ if (total_order == 0) {
+ iter->Seek("1000000000foo009");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ }
+
+ // Test Bloom Filter
+ if (bloom_bits > 0) {
+ if (!total_order) {
+              // Neither the key nor its prefix exists, so the Bloom filter
+              // should not match.
+ expect_bloom_not_match = true;
+ iter->Seek("2not000000000bar");
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+ expect_bloom_not_match = false;
+ } else {
+ expect_bloom_not_match = true;
+ ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+ expect_bloom_not_match = false;
+ }
+ }
+
+ delete iter;
+ }
+ }
+ }
+ }
+}
+
+namespace {
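+// Builds a 16-byte key: the first four bytes hold the decimal digits of n
+// (least significant first) as raw byte values 0-9, the rest is 'filler'.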
+std::string NthKey(size_t n, char filler) {
+ std::string rv(16, filler);
+ rv[0] = n % 10;
+ rv[1] = (n / 10) % 10;
+ rv[2] = (n / 100) % 10;
+ rv[3] = (n / 1000) % 10;
+ return rv;
+}
+} // anonymous namespace
+
+TEST_P(PlainTableDBTest, BloomSchema) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) {
+ options.bloom_locality = bloom_locality;
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 3; // high FP rate for test
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPlain;
+
+ bool expect_bloom_not_match = false;
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */,
+ kDefaultColumnFamilyName));
+ DestroyAndReopen(&options);
+
+ for (unsigned i = 0; i < 2345; ++i) {
+ ASSERT_OK(Put(NthKey(i, 'y'), "added"));
+ }
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("added", Get(NthKey(42, 'y')));
+
+ for (unsigned i = 0; i < 32; ++i) {
+ // Known pattern of Bloom filter false positives can detect schema change
+ // with high probability. Known FPs stuffed into bits:
+ uint32_t pattern;
+ if (!bloom_locality) {
+ pattern = 1785868347UL;
+ } else if (CACHE_LINE_SIZE == 64U) {
+ pattern = 2421694657UL;
+ } else if (CACHE_LINE_SIZE == 128U) {
+ pattern = 788710956UL;
+ } else {
+ ASSERT_EQ(CACHE_LINE_SIZE, 256U);
+ pattern = 163905UL;
+ }
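+      // Bit i of 'pattern' being set means key NthKey(i, 'n') is expected to
+      // be a Bloom false positive under this filter schema.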
+ bool expect_fp = pattern & (1UL << i);
+ // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp);
+ expect_bloom_not_match = !expect_fp;
+ ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n')));
+ }
+ }
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+ return std::string(length, c);
+}
+} // namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeys) {
+ Options options = CurrentOptions();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.create_if_missing = true;
+ options.prefix_extractor.reset();
+ DestroyAndReopen(&options);
+
+ std::string key_list[] = {
+ MakeLongKey(30, '0'),
+ MakeLongKey(16, '1'),
+ MakeLongKey(32, '2'),
+ MakeLongKey(60, '3'),
+ MakeLongKey(90, '4'),
+ MakeLongKey(50, '5'),
+ MakeLongKey(26, '6')
+ };
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_OK(Put(key_list[i], ToString(i)));
+ }
+
+ dbfull()->TEST_FlushMemTable();
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek(key_list[0]);
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key_list[i], iter->key().ToString());
+ ASSERT_EQ(ToString(i), iter->value().ToString());
+ iter->Next();
+ }
+
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+namespace {
+std::string MakeLongKeyWithPrefix(size_t length, char c) {
+ return "00000000" + std::string(length - 8, c);
+}
+} // namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
+ Options options = CurrentOptions();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0.8;
+ plain_table_options.index_sparseness = 3;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPrefix;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.create_if_missing = true;
+ DestroyAndReopen(&options);
+
+ std::string key_list[] = {
+ MakeLongKeyWithPrefix(30, '0'), MakeLongKeyWithPrefix(16, '1'),
+ MakeLongKeyWithPrefix(32, '2'), MakeLongKeyWithPrefix(60, '3'),
+ MakeLongKeyWithPrefix(90, '4'), MakeLongKeyWithPrefix(50, '5'),
+ MakeLongKeyWithPrefix(26, '6')};
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_OK(Put(key_list[i], ToString(i)));
+ }
+
+ dbfull()->TEST_FlushMemTable();
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek(key_list[0]);
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key_list[i], iter->key().ToString());
+ ASSERT_EQ(ToString(i), iter->value().ToString());
+ iter->Next();
+ }
+
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+  // Use a reverse-suffix comparator: keys sharing the first 8 bytes iterate
+  // in descending suffix order.
+ test::SimpleSuffixReverseComparator comp;
+ options.comparator = &comp;
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000foo002", "v_2"));
+ ASSERT_OK(Put("0000000000000bar", "random"));
+ ASSERT_OK(Put("1000000000foo001", "v1"));
+ ASSERT_OK(Put("3000000000000bar", "bar_v"));
+ ASSERT_OK(Put("1000000000foo003", "v__3"));
+ ASSERT_OK(Put("1000000000foo004", "v__4"));
+ ASSERT_OK(Put("1000000000foo005", "v__5"));
+ ASSERT_OK(Put("1000000000foo007", "v__7"));
+ ASSERT_OK(Put("1000000000foo008", "v__8"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v1", Get("1000000000foo001"));
+ ASSERT_EQ("v__3", Get("1000000000foo003"));
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek("1000000000foo009");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo007", iter->key().ToString());
+ ASSERT_EQ("v__7", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo004", iter->key().ToString());
+ ASSERT_EQ("v__4", iter->value().ToString());
+
+ iter->Seek("3000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ ASSERT_EQ("bar_v", iter->value().ToString());
+
+ iter->Seek("1000000000foo005");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo006");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo008");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflict) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (unsigned char i = 1; i <= 3; i++) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
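+      // Note: '^' below is bitwise XOR, so the sparseness values are 3, 0
+      // and 1 for i = 1..3, not powers of two.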
+ plain_table_options.index_sparseness = 2 ^ i;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo0", "v3"));
+ ASSERT_OK(Put("2000000000000fo1", "v4"));
+ ASSERT_OK(Put("2000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo3", "v"));
+
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("2000000000000fo0"));
+ ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+ ReadOptions ro;
+ Iterator* iter = dbfull()->NewIterator(ro);
+
+ iter->Seek("5000000000000fo0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("2000000000000fo8");
+ ASSERT_TRUE(!iter->Valid() ||
+ options.comparator->Compare(iter->key(), "20000001") > 0);
+
+ iter->Seek("5000000000000fo8");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("3000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (unsigned char i = 1; i <= 3; i++) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ test::SimpleSuffixReverseComparator comp;
+ options.comparator = &comp;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
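+      // Note: '^' below is bitwise XOR, so the sparseness values are 3, 0
+      // and 1 for i = 1..3, not powers of two.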
+ plain_table_options.index_sparseness = 2 ^ i;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo0", "v3"));
+ ASSERT_OK(Put("2000000000000fo1", "v4"));
+ ASSERT_OK(Put("2000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo3", "v"));
+
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("2000000000000fo0"));
+ ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+ ReadOptions ro;
+ Iterator* iter = dbfull()->NewIterator(ro);
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000var");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+ iter->Seek("5000000000000var");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+
+ std::string seek_key = "2000000000000bar";
+ iter->Seek(seek_key);
+ ASSERT_TRUE(!iter->Valid() ||
+ options.prefix_extractor->Transform(iter->key()) !=
+ options.prefix_extractor->Transform(seek_key));
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("3000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 5;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+ dbfull()->TEST_FlushMemTable();
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+ ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+ iter->Seek("5000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000fo8");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key_______%06d", i);
+ return std::string(buf);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+
+TEST_P(PlainTableDBTest, CompactionTrigger) {
+ Options options = CurrentOptions();
+  options.write_buffer_size = 120 << 10;  // 120KB
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ Reopen(&options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 120KB (10 values, each 12K)
+ for (int i = 0; i < 10; i++) {
+ values.push_back(RandomString(&rnd, 12000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Put(Key(999), ""));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+ }
+
+  // Generate one more file in level-0, which should trigger level-0
+  // compaction.
+ std::vector<std::string> values;
+ for (int i = 0; i < 12; i++) {
+ values.push_back(RandomString(&rnd, 10000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Put(Key(999), ""));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+TEST_P(PlainTableDBTest, AdaptiveTable) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+
+ options.table_factory.reset(NewPlainTableFactory());
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(Put("0000000000000bar", "v2"));
+ ASSERT_OK(Put("1000000000000foo", "v3"));
+ dbfull()->TEST_FlushMemTable();
+
+ options.create_if_missing = false;
+ std::shared_ptr<TableFactory> block_based_factory(
+ NewBlockBasedTableFactory());
+ std::shared_ptr<TableFactory> plain_table_factory(
+ NewPlainTableFactory());
+ std::shared_ptr<TableFactory> dummy_factory;
+ options.table_factory.reset(NewAdaptiveTableFactory(
+ block_based_factory, block_based_factory, plain_table_factory));
+ Reopen(&options);
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+
+ ASSERT_OK(Put("2000000000000foo", "v4"));
+ ASSERT_OK(Put("3000000000000bar", "v5"));
+ dbfull()->TEST_FlushMemTable();
+ ASSERT_EQ("v4", Get("2000000000000foo"));
+ ASSERT_EQ("v5", Get("3000000000000bar"));
+
+ Reopen(&options);
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+ ASSERT_EQ("v4", Get("2000000000000foo"));
+ ASSERT_EQ("v5", Get("3000000000000bar"));
+
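+  // A single-format factory cannot read SST files written in the other
+  // format, so these Gets are expected not to return the stored values.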
+ options.paranoid_checks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ Reopen(&options);
+ ASSERT_NE("v3", Get("1000000000000foo"));
+
+ options.paranoid_checks = false;
+ options.table_factory.reset(NewPlainTableFactory());
+ Reopen(&options);
+ ASSERT_NE("v5", Get("3000000000000bar"));
+}
+
+INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/pre_release_callback.h b/src/rocksdb/db/pre_release_callback.h
new file mode 100644
index 000000000..b74be9537
--- /dev/null
+++ b/src/rocksdb/db/pre_release_callback.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+class PreReleaseCallback {
+ public:
+ virtual ~PreReleaseCallback() {}
+
+ // Will be called while on the write thread after the write to the WAL and
+ // before the write to memtable. This is useful if any operation needs to be
+ // done before the write gets visible to the readers, or if we want to reduce
+ // the overhead of locking by updating something sequentially while we are on
+ // the write thread. If the callback fails, this function returns a non-OK
+  // status, the sequence number will not be released, and the same status
+  // will be propagated to all the writers in the write group.
+ // seq is the sequence number that is used for this write and will be
+ // released.
+ // is_mem_disabled is currently used for debugging purposes to assert that
+ // the callback is done from the right write queue.
+ // If non-zero, log_number indicates the WAL log to which we wrote.
+  // index >= 0 specifies the order of the callback in the same write thread.
+  // total > index specifies the total number of callbacks in the same write
+  // thread. Together with index, it can be used to reduce redundant
+  // operations among the callbacks.
+ virtual Status Callback(SequenceNumber seq, bool is_mem_disabled,
+ uint64_t log_number, size_t index, size_t total) = 0;
+};
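+
+// A minimal illustrative sketch (hypothetical, not part of the API) of a
+// subclass that records the last sequence number it observed; it would also
+// need <atomic>:
+//
+//   class RecordLastSeqCallback : public PreReleaseCallback {
+//    public:
+//     Status Callback(SequenceNumber seq, bool /*is_mem_disabled*/,
+//                     uint64_t /*log_number*/, size_t /*index*/,
+//                     size_t /*total*/) override {
+//       last_seq_.store(seq, std::memory_order_relaxed);
+//       return Status::OK();
+//     }
+//
+//    private:
+//     std::atomic<SequenceNumber> last_seq_{0};
+//   };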
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc
new file mode 100644
index 000000000..c61ec2a1e
--- /dev/null
+++ b/src/rocksdb/db/prefix_test.cc
@@ -0,0 +1,895 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+ return 0;
+}
+#else
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(trigger_deadlock, false,
+ "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_int32(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int32(max_write_buffer_number, 2, "");
+DEFINE_int32(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, "");
+DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, "");
+DEFINE_int32(value_size, 40, "");
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+
+// Path to the database on file system
+const std::string kDbName =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("prefix_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TestKey {
+ uint64_t prefix;
+ uint64_t sorted;
+
+ TestKey(uint64_t _prefix, uint64_t _sorted)
+ : prefix(_prefix), sorted(_sorted) {}
+};
+
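+// Keys are encoded as 16 bytes: a fixed64 'prefix' followed by a fixed64
+// 'sorted' suffix, so the fixed 8-byte prefix transform used by these tests
+// extracts exactly the 'prefix' field.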
+// return a slice backed by test_key
+inline Slice TestKeyToSlice(std::string &s, const TestKey& test_key) {
+ s.clear();
+ PutFixed64(&s, test_key.prefix);
+ PutFixed64(&s, test_key.sorted);
+ return Slice(s.c_str(), s.size());
+}
+
+inline const TestKey SliceToTestKey(const Slice& slice) {
+ return TestKey(DecodeFixed64(slice.data()),
+ DecodeFixed64(slice.data() + 8));
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+
+  // Compare needs to be aware of the possibility that a and/or b may be a
+  // prefix only
+ int Compare(const Slice& a, const Slice& b) const override {
+ const TestKey kkey_a = SliceToTestKey(a);
+ const TestKey kkey_b = SliceToTestKey(b);
+ const TestKey *key_a = &kkey_a;
+ const TestKey *key_b = &kkey_b;
+ if (key_a->prefix != key_b->prefix) {
+ if (key_a->prefix < key_b->prefix) return -1;
+ if (key_a->prefix > key_b->prefix) return 1;
+ } else {
+ EXPECT_TRUE(key_a->prefix == key_b->prefix);
+ // note, both a and b could be prefix only
+ if (a.size() != b.size()) {
+ // one of them is prefix
+ EXPECT_TRUE(
+ (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+ (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+ if (a.size() < b.size()) return -1;
+ if (a.size() > b.size()) return 1;
+ } else {
+        if (a.size() == sizeof(uint64_t)) {
+          // both a and b are prefix-only keys
+          return 0;
+        }
+
+ // both a and b are whole key
+ EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+ if (key_a->sorted < key_b->sorted) return -1;
+ if (key_a->sorted > key_b->sorted) return 1;
+ if (key_a->sorted == key_b->sorted) return 0;
+ }
+ }
+ return 0;
+ }
+
+ bool operator()(const TestKey& a, const TestKey& b) const {
+ std::string sa, sb;
+ return Compare(TestKeyToSlice(sa, a), TestKeyToSlice(sb, b)) < 0;
+ }
+
+ const char* Name() const override { return "TestKeyComparator"; }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+namespace {
+void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
+ uint64_t suffix, const Slice& value) {
+ TestKey test_key(prefix, suffix);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void PutKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+ const Slice& value) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void MergeKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+ const Slice& value) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Merge(write_options, key, value));
+}
+
+void DeleteKey(DB* db, WriteOptions write_options, const TestKey& test_key) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Delete(write_options, key));
+}
+
+void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
+ TestKey test_key(prefix, suffix);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ iter->Seek(key);
+}
+
+const std::string kNotFoundResult = "NOT_FOUND";
+
+std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
+ uint64_t suffix) {
+ TestKey test_key(prefix, suffix);
+ std::string s2;
+ Slice key = TestKeyToSlice(s2, test_key);
+
+ std::string result;
+ Status s = db->Get(read_options, key, &result);
+ if (s.IsNotFound()) {
+ result = kNotFoundResult;
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
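+// Maps every key that starts with the given prefix to that prefix itself;
+// keys that do not start with it are out of the transform's domain.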
+class SamePrefixTransform : public SliceTransform {
+ private:
+ const Slice prefix_;
+ std::string name_;
+
+ public:
+ explicit SamePrefixTransform(const Slice& prefix)
+ : prefix_(prefix), name_("rocksdb.SamePrefix." + prefix.ToString()) {}
+
+ const char* Name() const override { return name_.c_str(); }
+
+ Slice Transform(const Slice& src) const override {
+ assert(InDomain(src));
+ return prefix_;
+ }
+
+ bool InDomain(const Slice& src) const override {
+ if (src.size() >= prefix_.size()) {
+ return Slice(src.data(), prefix_.size()) == prefix_;
+ }
+ return false;
+ }
+
+ bool InRange(const Slice& dst) const override { return dst == prefix_; }
+
+ bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
+};
+
+} // namespace
+
+class PrefixTest : public testing::Test {
+ public:
+ std::shared_ptr<DB> OpenDb() {
+ DB* db;
+
+ options.create_if_missing = true;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+
+ options.memtable_prefix_bloom_size_ratio =
+ FLAGS_memtable_prefix_bloom_size_ratio;
+ options.memtable_huge_page_size = FLAGS_memtable_huge_page_size;
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.allow_concurrent_memtable_write = false;
+
+ Status s = DB::Open(options, kDbName, &db);
+ EXPECT_OK(s);
+ return std::shared_ptr<DB>(db);
+ }
+
+ void FirstOption() {
+ option_config_ = kBegin;
+ }
+
+ bool NextOptions(int bucket_count) {
+ // skip some options
+ option_config_++;
+ if (option_config_ < kEnd) {
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ switch(option_config_) {
+ case kHashSkipList:
+ options.memtable_factory.reset(
+ NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
+ return true;
+ case kHashLinkList:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count));
+ return true;
+ case kHashLinkListHugePageTlb:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
+ return true;
+ case kHashLinkListTriggerSkipList:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count, 0, 3));
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+ }
+
+ PrefixTest() : option_config_(kBegin) {
+ options.comparator = new TestKeyComparator();
+ }
+ ~PrefixTest() override { delete options.comparator; }
+
+ protected:
+ enum OptionConfig {
+ kBegin,
+ kHashSkipList,
+ kHashLinkList,
+ kHashLinkListHugePageTlb,
+ kHashLinkListTriggerSkipList,
+ kEnd
+ };
+ int option_config_;
+ Options options;
+};
+
+TEST(SamePrefixTest, InDomainTest) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(new SamePrefixTransform("HHKB"));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ WriteOptions write_options;
+ ReadOptions read_options;
+ {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ ASSERT_OK(DB::Open(options, kDbName, &db));
+ ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006"));
+ ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011"));
+ ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk"));
+ db->Flush(FlushOptions());
+ std::string result;
+ auto db_iter = db->NewIterator(ReadOptions());
+
+ db_iter->Seek("Realforce 87u");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(db_iter->key(), "Realforce 87u");
+ ASSERT_EQ(db_iter->value(), "idk");
+
+ delete db_iter;
+ delete db;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ }
+
+ {
+ ASSERT_OK(DB::Open(options, kDbName, &db));
+ ASSERT_OK(db->Put(write_options, "pikachu", "1"));
+ ASSERT_OK(db->Put(write_options, "Meowth", "1"));
+ ASSERT_OK(db->Put(write_options, "Mewtwo", "idk"));
+ db->Flush(FlushOptions());
+ std::string result;
+ auto db_iter = db->NewIterator(ReadOptions());
+
+ db_iter->Seek("Mewtwo");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ delete db_iter;
+ delete db;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ }
+}
+
+TEST_F(PrefixTest, TestResult) {
+ for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+ FirstOption();
+ while (NextOptions(num_buckets)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << " number of buckets: " << num_buckets
+ << std::endl;
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ // 1. Insert one row.
+ Slice v16("v16");
+ PutKey(db.get(), write_options, 1, 6, v16);
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(!iter->Valid());
+
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
+
+ // 2. Insert an entry for the same prefix as the last entry in the bucket.
+ Slice v17("v17");
+ PutKey(db.get(), write_options, 1, 7, v17);
+ iter.reset(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(!iter->Valid());
+
+ // 3. Insert an entry for the same prefix as the head of the bucket.
+ Slice v15("v15");
+ PutKey(db.get(), write_options, 1, 5, v15);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+
+ // 4. Insert an entry with a larger prefix
+ Slice v22("v22");
+ PutKey(db.get(), write_options, 2, 2, v22);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 2, 2);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ // 5. Insert an entry with a smaller prefix
+ Slice v02("v02");
+ PutKey(db.get(), write_options, 0, 2, v02);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 0, 2);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+ SeekIterator(iter.get(), 0, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ // 6. Insert to the beginning and the end of the first prefix
+ Slice v13("v13");
+ Slice v18("v18");
+ PutKey(db.get(), write_options, 1, 3, v13);
+ PutKey(db.get(), write_options, 1, 8, v18);
+ iter.reset(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 3);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v13 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v18 == iter->value());
+
+ SeekIterator(iter.get(), 0, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
+ ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
+ ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
+ ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+ ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
+ }
+ }
+}
+
+// Verify that iteration stays within the seek key's prefix when
+// prefix_same_as_start is set.
+TEST_F(PrefixTest, PrefixValid) {
+ for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+ FirstOption();
+ while (NextOptions(num_buckets)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << " number of buckets: " << num_buckets << std::endl;
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+      // Insert keys sharing a common prefix and one key with a different
+      // prefix.
+ Slice v16("v16");
+ Slice v17("v17");
+ Slice v18("v18");
+ Slice v19("v19");
+ PutKey(db.get(), write_options, 12345, 6, v16);
+ PutKey(db.get(), write_options, 12345, 7, v17);
+ PutKey(db.get(), write_options, 12345, 8, v18);
+ PutKey(db.get(), write_options, 12345, 9, v19);
+ PutKey(db.get(), write_options, 12346, 8, v16);
+ db->Flush(FlushOptions());
+ TestKey test_key(12346, 8);
+ std::string s;
+ db->Delete(write_options, TestKeyToSlice(s, test_key));
+ db->Flush(FlushOptions());
+ read_options.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 12345, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v18 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v19 == iter->value());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8));
+
+ // Verify seeking past the prefix won't return a result.
+ SeekIterator(iter.get(), 12345, 10);
+ ASSERT_TRUE(!iter->Valid());
+ }
+ }
+}
+
+TEST_F(PrefixTest, DynamicPrefixIterator) {
+ while (NextOptions(FLAGS_bucket_count)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << std::endl;
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ std::vector<uint64_t> prefixes;
+ for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+ prefixes.push_back(i);
+ }
+
+ if (FLAGS_random_prefix) {
+ std::random_shuffle(prefixes.begin(), prefixes.end());
+ }
+
+ HistogramImpl hist_put_time;
+ HistogramImpl hist_put_comparison;
+
+    // Insert FLAGS_total_prefixes prefixes (optionally shuffled), each with
+    // FLAGS_items_per_prefix consecutive elements.
+ for (auto prefix : prefixes) {
+ for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+ TestKey test_key(prefix, sorted);
+
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ std::string value(FLAGS_value_size, 0);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ ASSERT_OK(db->Put(write_options, key, value));
+ hist_put_time.Add(timer.ElapsedNanos());
+ hist_put_comparison.Add(get_perf_context()->user_key_comparison_count);
+ }
+ }
+
+ std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
+ << "Put time: \n" << hist_put_time.ToString();
+
+ // test seek existing keys
+ HistogramImpl hist_seek_time;
+ HistogramImpl hist_seek_comparison;
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ for (auto prefix : prefixes) {
+ TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ std::string value = "v" + ToString(0);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ auto key_prefix = options.prefix_extractor->Transform(key);
+ uint64_t total_keys = 0;
+ for (iter->Seek(key);
+ iter->Valid() && iter->key().starts_with(key_prefix);
+ iter->Next()) {
+ if (FLAGS_trigger_deadlock) {
+ std::cout << "Behold the deadlock!\n";
+ db->Delete(write_options, iter->key());
+ }
+ total_keys++;
+ }
+ hist_seek_time.Add(timer.ElapsedNanos());
+ hist_seek_comparison.Add(get_perf_context()->user_key_comparison_count);
+ ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
+ }
+
+ std::cout << "Seek key comparison: \n"
+ << hist_seek_comparison.ToString()
+ << "Seek time: \n"
+ << hist_seek_time.ToString();
+
+ // test non-existing keys
+ HistogramImpl hist_no_seek_time;
+ HistogramImpl hist_no_seek_comparison;
+
+ for (auto prefix = FLAGS_total_prefixes;
+ prefix < FLAGS_total_prefixes + 10000;
+ prefix++) {
+ TestKey test_key(prefix, 0);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(Env::Default(), true);
+ iter->Seek(key);
+ hist_no_seek_time.Add(timer.ElapsedNanos());
+ hist_no_seek_comparison.Add(get_perf_context()->user_key_comparison_count);
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ std::cout << "non-existing Seek key comparison: \n"
+ << hist_no_seek_comparison.ToString()
+ << "non-existing Seek time: \n"
+ << hist_no_seek_time.ToString();
+ }
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev) {
+ // Only for SkipListFactory
+ options.memtable_factory.reset(new SkipListFactory);
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.write_buffer_size = 1024 * 1024;
+ Random rnd(1);
+ for (size_t m = 1; m < 100; m++) {
+ std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: "
+ << options.memtable_factory->Name() << std::endl;
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::map<TestKey, std::string, TestKeyComparator> entry_maps[3], whole_map;
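+    // entry_maps[0] and entry_maps[1] end up in two flushed SST files while
+    // entry_maps[2] stays in the memtable; whole_map holds the expected
+    // merged view.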
+ for (uint64_t i = 0; i < 10; i++) {
+ int div = i % 3 + 1;
+ for (uint64_t j = 0; j < 10; j++) {
+ whole_map[TestKey(i, j)] = entry_maps[rnd.Uniform(div)][TestKey(i, j)] =
+ 'v' + std::to_string(i) + std::to_string(j);
+ }
+ }
+
+ std::map<TestKey, std::string, TestKeyComparator> type_map;
+ for (size_t i = 0; i < 3; i++) {
+ for (auto& kv : entry_maps[i]) {
+ if (rnd.OneIn(3)) {
+ PutKey(db.get(), write_options, kv.first, kv.second);
+ type_map[kv.first] = "value";
+ } else {
+ MergeKey(db.get(), write_options, kv.first, kv.second);
+ type_map[kv.first] = "merge";
+ }
+ }
+ if (i < 2) {
+ db->Flush(FlushOptions());
+ }
+ }
+
+ for (size_t i = 0; i < 2; i++) {
+ for (auto& kv : entry_maps[i]) {
+ if (rnd.OneIn(10)) {
+ whole_map.erase(kv.first);
+ DeleteKey(db.get(), write_options, kv.first);
+ entry_maps[2][kv.first] = "delete";
+ }
+ }
+ }
+
+ if (FLAGS_enable_print) {
+ for (size_t i = 0; i < 3; i++) {
+ for (auto& kv : entry_maps[i]) {
+ std::cout << "[" << i << "]" << kv.first.prefix << kv.first.sorted
+ << " " << kv.second + " " + type_map[kv.first] << std::endl;
+ }
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ for (uint64_t prefix = 0; prefix < 10; prefix++) {
+ uint64_t start_suffix = rnd.Uniform(9);
+ SeekIterator(iter.get(), prefix, start_suffix);
+ auto it = whole_map.find(TestKey(prefix, start_suffix));
+ if (it == whole_map.end()) {
+ continue;
+ }
+ ASSERT_NE(it, whole_map.end());
+ ASSERT_TRUE(iter->Valid());
+ if (FLAGS_enable_print) {
+ std::cout << "round " << prefix
+ << " iter: " << SliceToTestKey(iter->key()).prefix
+ << SliceToTestKey(iter->key()).sorted
+ << " | map: " << it->first.prefix << it->first.sorted << " | "
+ << iter->value().ToString() << " " << it->second << std::endl;
+ }
+ ASSERT_EQ(iter->value(), it->second);
+ uint64_t stored_prefix = prefix;
+ for (size_t k = 0; k < 9; k++) {
+ if (rnd.OneIn(2) || it == whole_map.begin()) {
+ iter->Next();
+ ++it;
+ if (FLAGS_enable_print) {
+ std::cout << "Next >> ";
+ }
+ } else {
+ iter->Prev();
+ it--;
+ if (FLAGS_enable_print) {
+ std::cout << "Prev >> ";
+ }
+ }
+ if (!iter->Valid() ||
+ SliceToTestKey(iter->key()).prefix != stored_prefix) {
+ break;
+ }
+ stored_prefix = SliceToTestKey(iter->key()).prefix;
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_NE(it, whole_map.end());
+ ASSERT_EQ(iter->value(), it->second);
+ if (FLAGS_enable_print) {
+ std::cout << "iter: " << SliceToTestKey(iter->key()).prefix
+ << SliceToTestKey(iter->key()).sorted
+ << " | map: " << it->first.prefix << it->first.sorted
+ << " | " << iter->value().ToString() << " " << it->second
+ << std::endl;
+ }
+ }
+ }
+ }
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev2) {
+ // Only for SkipListFactory
+  // Test the following case:
+  //        iter1                iter2
+  // | prefix | suffix |  | prefix | suffix |
+  // |   1    |   1    |  |   1    |   2    |
+  // |   1    |   3    |  |   1    |   4    |
+  // |   2    |   1    |  |   3    |   3    |
+  // |   2    |   2    |  |   3    |   4    |
+  // After Seek(15), iter1 will be at 21 and iter2 will be at 33.
+  // Then, if Prev() is called in prefix mode, SeekForPrev(21) gets called and
+  // iter2 should become invalid because of the bloom filter.
+ options.memtable_factory.reset(new SkipListFactory);
+ options.write_buffer_size = 1024 * 1024;
+ std::string v13("v13");
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+ PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+ PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 5);
+ iter->Prev();
+ ASSERT_EQ(iter->value(), v13);
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev3) {
+ // Only for SkipListFactory
+ // test SeekToLast() with iterate_upper_bound_ in prefix_seek_mode
+ options.memtable_factory.reset(new SkipListFactory);
+ options.write_buffer_size = 1024 * 1024;
+ std::string v14("v14");
+ TestKey upper_bound_key = TestKey(1, 5);
+ std::string s;
+ Slice upper_bound = TestKeyToSlice(s, upper_bound_key);
+
+ {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ read_options.iterate_upper_bound = &upper_bound;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+ PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ iter->SeekToLast();
+ ASSERT_EQ(iter->value(), v14);
+ }
+ {
+ DestroyDB(kDbName, Options());
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ read_options.iterate_upper_bound = &upper_bound;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+ PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ db->Flush(FlushOptions());
+ reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ iter->SeekToLast();
+ ASSERT_EQ(iter->value(), v14);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+}
+
+#endif // GFLAGS
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as HashSkipList and HashLinkList are not supported in "
+ "ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/range_del_aggregator.cc b/src/rocksdb/db/range_del_aggregator.cc
new file mode 100644
index 000000000..1f6a7b139
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.cc
@@ -0,0 +1,484 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TruncatedRangeDelIterator::TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+ const InternalKeyComparator* icmp, const InternalKey* smallest,
+ const InternalKey* largest)
+ : iter_(std::move(iter)),
+ icmp_(icmp),
+ smallest_ikey_(smallest),
+ largest_ikey_(largest) {
+ if (smallest != nullptr) {
+ pinned_bounds_.emplace_back();
+ auto& parsed_smallest = pinned_bounds_.back();
+ if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) {
+ assert(false);
+ }
+ smallest_ = &parsed_smallest;
+ }
+ if (largest != nullptr) {
+ pinned_bounds_.emplace_back();
+ auto& parsed_largest = pinned_bounds_.back();
+ if (!ParseInternalKey(largest->Encode(), &parsed_largest)) {
+ assert(false);
+ }
+ if (parsed_largest.type == kTypeRangeDeletion &&
+ parsed_largest.sequence == kMaxSequenceNumber) {
+ // The file boundary has been artificially extended by a range tombstone.
+ // We do not need to adjust largest to properly truncate range
+ // tombstones that extend past the boundary.
+ } else if (parsed_largest.sequence == 0) {
+ // The largest key in the sstable has a sequence number of 0. Since we
+ // guarantee that no internal keys with the same user key and sequence
+ // number can exist in a DB, we know that the largest key in this sstable
+ // cannot exist as the smallest key in the next sstable. This further
+ // implies that no range tombstone in this sstable covers largest;
+ // otherwise, the file boundary would have been artificially extended.
+ //
+ // Therefore, we will never truncate a range tombstone at largest, so we
+ // can leave it unchanged.
+ } else {
+      // The same user key may span the boundary between two adjacent
+      // sstables. To ensure that the truncated end key still covers the
+      // largest key in this sstable, reduce its sequence number by 1.
+ parsed_largest.sequence -= 1;
+ }
+ largest_ = &parsed_largest;
+ }
+}
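+
+// Illustrative sketch (hypothetical values): for a file whose largest key is
+// ("k", seq=7, kTypeValue), the constructor above rewrites the parsed upper
+// bound to ("k", seq=6). Internal keys with the same user key sort by
+// descending sequence number, so an end key truncated to ("k", 6) still
+// covers ("k", 7), which is the guarantee described in the comment above.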
+
+bool TruncatedRangeDelIterator::Valid() const {
+ return iter_->Valid() &&
+ (smallest_ == nullptr ||
+ icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
+ (largest_ == nullptr ||
+ icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
+}
+
+void TruncatedRangeDelIterator::Next() { iter_->TopNext(); }
+
+void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); }
+
+void TruncatedRangeDelIterator::InternalNext() { iter_->Next(); }
+
+// NOTE: target is a user key
+void TruncatedRangeDelIterator::Seek(const Slice& target) {
+ if (largest_ != nullptr &&
+ icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
+ kTypeRangeDeletion)) <= 0) {
+ iter_->Invalidate();
+ return;
+ }
+ if (smallest_ != nullptr &&
+ icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) {
+ iter_->Seek(smallest_->user_key);
+ return;
+ }
+ iter_->Seek(target);
+}
+
+// NOTE: target is a user key
+void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
+ if (smallest_ != nullptr &&
+ icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
+ *smallest_) < 0) {
+ iter_->Invalidate();
+ return;
+ }
+ if (largest_ != nullptr &&
+ icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) {
+ iter_->SeekForPrev(largest_->user_key);
+ return;
+ }
+ iter_->SeekForPrev(target);
+}
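+
+// Illustrative sketch (hypothetical bounds): with smallest_ at user key "c"
+// and largest_ at user key "p",
+//   Seek("a")        forwards to iter_->Seek("c") because "a" < smallest_;
+//   Seek("q")        invalidates the iterator because largest_ lies at or
+//                    before the internal key built from "q";
+//   SeekForPrev("q") forwards to iter_->SeekForPrev("p");
+//   SeekForPrev("a") invalidates the iterator because "a" < smallest_.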
+
+void TruncatedRangeDelIterator::SeekToFirst() {
+ if (smallest_ != nullptr) {
+ iter_->Seek(smallest_->user_key);
+ return;
+ }
+ iter_->SeekToTopFirst();
+}
+
+void TruncatedRangeDelIterator::SeekToLast() {
+ if (largest_ != nullptr) {
+ iter_->SeekForPrev(largest_->user_key);
+ return;
+ }
+ iter_->SeekToTopLast();
+}
+
+std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+TruncatedRangeDelIterator::SplitBySnapshot(
+ const std::vector<SequenceNumber>& snapshots) {
+ using FragmentedIterPair =
+ std::pair<const SequenceNumber,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>>;
+
+ auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots);
+ std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+ split_truncated_iters;
+ std::for_each(
+ split_untruncated_iters.begin(), split_untruncated_iters.end(),
+ [&](FragmentedIterPair& iter_pair) {
+ std::unique_ptr<TruncatedRangeDelIterator> truncated_iter(
+ new TruncatedRangeDelIterator(std::move(iter_pair.second), icmp_,
+ smallest_ikey_, largest_ikey_));
+ split_truncated_iters.emplace(iter_pair.first,
+ std::move(truncated_iter));
+ });
+ return split_truncated_iters;
+}
+
+ForwardRangeDelIterator::ForwardRangeDelIterator(
+ const InternalKeyComparator* icmp)
+ : icmp_(icmp),
+ unused_idx_(0),
+ active_seqnums_(SeqMaxComparator()),
+ active_iters_(EndKeyMinComparator(icmp)),
+ inactive_iters_(StartKeyMinComparator(icmp)) {}
+
+bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+ // Move active iterators that end before parsed.
+ while (!active_iters_.empty() &&
+ icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
+ TruncatedRangeDelIterator* iter = PopActiveIter();
+ do {
+ iter->Next();
+ } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ // Move inactive iterators that start before parsed.
+ while (!inactive_iters_.empty() &&
+ icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
+ TruncatedRangeDelIterator* iter = PopInactiveIter();
+ while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
+ iter->Next();
+ }
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ return active_seqnums_.empty()
+ ? false
+ : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
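+
+// Illustrative sketch (hypothetical tombstone): with a single active
+// tombstone ["c", "g") at sequence number 10, ShouldDelete for
+// ("d", seq=5, kTypeValue) returns true because the largest active tombstone
+// sequence number (10) exceeds the key's sequence number (5), while
+// ("d", seq=12, kTypeValue) is not deleted.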
+
+void ForwardRangeDelIterator::Invalidate() {
+ unused_idx_ = 0;
+ active_iters_.clear();
+ active_seqnums_.clear();
+ inactive_iters_.clear();
+}
+
+ReverseRangeDelIterator::ReverseRangeDelIterator(
+ const InternalKeyComparator* icmp)
+ : icmp_(icmp),
+ unused_idx_(0),
+ active_seqnums_(SeqMaxComparator()),
+ active_iters_(StartKeyMaxComparator(icmp)),
+ inactive_iters_(EndKeyMaxComparator(icmp)) {}
+
+bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+ // Move active iterators that start after parsed.
+ while (!active_iters_.empty() &&
+ icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
+ TruncatedRangeDelIterator* iter = PopActiveIter();
+ do {
+ iter->Prev();
+ } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ // Move inactive iterators that end after parsed.
+ while (!inactive_iters_.empty() &&
+ icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
+ TruncatedRangeDelIterator* iter = PopInactiveIter();
+ while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
+ iter->Prev();
+ }
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ return active_seqnums_.empty()
+ ? false
+ : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ReverseRangeDelIterator::Invalidate() {
+ unused_idx_ = 0;
+ active_iters_.clear();
+ active_seqnums_.clear();
+ inactive_iters_.clear();
+}
+
+bool RangeDelAggregator::StripeRep::ShouldDelete(
+ const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
+ if (!InStripe(parsed.sequence) || IsEmpty()) {
+ return false;
+ }
+ switch (mode) {
+ case RangeDelPositioningMode::kForwardTraversal:
+ InvalidateReverseIter();
+
+ // Pick up previously unseen iterators.
+ for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx());
+ it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) {
+ auto& iter = *it;
+ forward_iter_.AddNewIter(iter.get(), parsed);
+ }
+
+ return forward_iter_.ShouldDelete(parsed);
+ case RangeDelPositioningMode::kBackwardTraversal:
+ InvalidateForwardIter();
+
+ // Pick up previously unseen iterators.
+ for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
+ it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
+ auto& iter = *it;
+ reverse_iter_.AddNewIter(iter.get(), parsed);
+ }
+
+ return reverse_iter_.ShouldDelete(parsed);
+ default:
+ assert(false);
+ return false;
+ }
+}
+
+bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
+ const Slice& end) {
+ Invalidate();
+
+ // Set the internal start/end keys so that:
+ // - if start_ikey has the same user key and sequence number as the
+ // current end key, start_ikey will be considered greater; and
+ // - if end_ikey has the same user key and sequence number as the current
+ // start key, end_ikey will be considered greater.
+ ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
+ static_cast<ValueType>(0));
+ ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
+ for (auto& iter : iters_) {
+ bool checked_candidate_tombstones = false;
+ for (iter->SeekForPrev(start);
+ iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
+ iter->Next()) {
+ checked_candidate_tombstones = true;
+ if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+ icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+ return true;
+ }
+ }
+
+ if (!checked_candidate_tombstones) {
+      // Do an additional check for when the end of the range is the start
+      // key of a tombstone, which we would have missed above because
+      // SeekForPrev'ing to the range's start left the iterator invalid.
+ iter->SeekForPrev(end);
+ if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+ icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
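+
+// Illustrative sketch (hypothetical tombstone): with a single tombstone
+// ["c", "e"), IsRangeOverlapped("a", "b") is false, IsRangeOverlapped("b",
+// "c") is true (the query range ends at the tombstone's start key, which the
+// fallback SeekForPrev(end) check above catches), and
+// IsRangeOverlapped("d", "f") is true.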
+
+void ReadRangeDelAggregator::AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest, const InternalKey* largest) {
+ if (input_iter == nullptr || input_iter->empty()) {
+ return;
+ }
+ rep_.AddTombstones(
+ std::unique_ptr<TruncatedRangeDelIterator>(new TruncatedRangeDelIterator(
+ std::move(input_iter), icmp_, smallest, largest)));
+}
+
+bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) {
+ return rep_.ShouldDelete(parsed, mode);
+}
+
+bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
+ const Slice& end) {
+ InvalidateRangeDelMapPositions();
+ return rep_.IsRangeOverlapped(start, end);
+}
+
+void CompactionRangeDelAggregator::AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest, const InternalKey* largest) {
+ if (input_iter == nullptr || input_iter->empty()) {
+ return;
+ }
+ assert(input_iter->lower_bound() == 0);
+ assert(input_iter->upper_bound() == kMaxSequenceNumber);
+ parent_iters_.emplace_back(new TruncatedRangeDelIterator(
+ std::move(input_iter), icmp_, smallest, largest));
+
+ auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
+ for (auto& split_iter : split_iters) {
+ auto it = reps_.find(split_iter.first);
+ if (it == reps_.end()) {
+ bool inserted;
+ SequenceNumber upper_bound = split_iter.second->upper_bound();
+ SequenceNumber lower_bound = split_iter.second->lower_bound();
+ std::tie(it, inserted) = reps_.emplace(
+ split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
+ assert(inserted);
+ }
+ assert(it != reps_.end());
+ it->second.AddTombstones(std::move(split_iter.second));
+ }
+}
+
+bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) {
+ auto it = reps_.lower_bound(parsed.sequence);
+ if (it == reps_.end()) {
+ return false;
+ }
+ return it->second.ShouldDelete(parsed, mode);
+}
+
+namespace {
+
+class TruncatedRangeDelMergingIter : public InternalIterator {
+ public:
+ TruncatedRangeDelMergingIter(
+ const InternalKeyComparator* icmp, const Slice* lower_bound,
+ const Slice* upper_bound, bool upper_bound_inclusive,
+ const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
+ : icmp_(icmp),
+ lower_bound_(lower_bound),
+ upper_bound_(upper_bound),
+ upper_bound_inclusive_(upper_bound_inclusive),
+ heap_(StartKeyMinComparator(icmp)) {
+ for (auto& child : children) {
+ if (child != nullptr) {
+ assert(child->lower_bound() == 0);
+ assert(child->upper_bound() == kMaxSequenceNumber);
+ children_.push_back(child.get());
+ }
+ }
+ }
+
+ bool Valid() const override {
+ return !heap_.empty() && BeforeEndKey(heap_.top());
+ }
+ Status status() const override { return Status::OK(); }
+
+ void SeekToFirst() override {
+ heap_.clear();
+ for (auto& child : children_) {
+ if (lower_bound_ != nullptr) {
+ child->Seek(*lower_bound_);
+ } else {
+ child->SeekToFirst();
+ }
+ if (child->Valid()) {
+ heap_.push(child);
+ }
+ }
+ }
+
+ void Next() override {
+ auto* top = heap_.top();
+ top->InternalNext();
+ if (top->Valid()) {
+ heap_.replace_top(top);
+ } else {
+ heap_.pop();
+ }
+ }
+
+ Slice key() const override {
+ auto* top = heap_.top();
+ cur_start_key_.Set(top->start_key().user_key, top->seq(),
+ kTypeRangeDeletion);
+ return cur_start_key_.Encode();
+ }
+
+ Slice value() const override {
+ auto* top = heap_.top();
+ assert(top->end_key().sequence == kMaxSequenceNumber);
+ return top->end_key().user_key;
+ }
+
+ // Unused InternalIterator methods
+ void Prev() override { assert(false); }
+ void Seek(const Slice& /* target */) override { assert(false); }
+ void SeekForPrev(const Slice& /* target */) override { assert(false); }
+ void SeekToLast() override { assert(false); }
+
+ private:
+ bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
+ if (upper_bound_ == nullptr) {
+ return true;
+ }
+ int cmp = icmp_->user_comparator()->Compare(iter->start_key().user_key,
+ *upper_bound_);
+ return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
+ }
+
+ const InternalKeyComparator* icmp_;
+ const Slice* lower_bound_;
+ const Slice* upper_bound_;
+ bool upper_bound_inclusive_;
+ BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
+ std::vector<TruncatedRangeDelIterator*> children_;
+
+ mutable InternalKey cur_start_key_;
+};
+
+} // namespace
+
+std::unique_ptr<FragmentedRangeTombstoneIterator>
+CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
+ const Slice* upper_bound,
+ bool upper_bound_inclusive) {
+ InvalidateRangeDelMapPositions();
+ std::unique_ptr<TruncatedRangeDelMergingIter> merging_iter(
+ new TruncatedRangeDelMergingIter(icmp_, lower_bound, upper_bound,
+ upper_bound_inclusive, parent_iters_));
+
+ auto fragmented_tombstone_list =
+ std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(merging_iter), *icmp_, true /* for_compaction */,
+ *snapshots_);
+
+ return std::unique_ptr<FragmentedRangeTombstoneIterator>(
+ new FragmentedRangeTombstoneIterator(
+ fragmented_tombstone_list, *icmp_,
+ kMaxSequenceNumber /* upper_bound */));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator.h b/src/rocksdb/db/range_del_aggregator.h
new file mode 100644
index 000000000..b47cf31d3
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.h
@@ -0,0 +1,441 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TruncatedRangeDelIterator {
+ public:
+ TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+ const InternalKeyComparator* icmp, const InternalKey* smallest,
+ const InternalKey* largest);
+
+ bool Valid() const;
+
+ void Next();
+ void Prev();
+
+ void InternalNext();
+
+  // Seeks to the tombstone with the highest visible sequence number that covers
+ // target (a user key). If no such tombstone exists, the position will be at
+ // the earliest tombstone that ends after target.
+ void Seek(const Slice& target);
+
+  // Seeks to the tombstone with the highest visible sequence number that covers
+ // target (a user key). If no such tombstone exists, the position will be at
+ // the latest tombstone that starts before target.
+ void SeekForPrev(const Slice& target);
+
+ void SeekToFirst();
+ void SeekToLast();
+
+ ParsedInternalKey start_key() const {
+ return (smallest_ == nullptr ||
+ icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+ ? iter_->parsed_start_key()
+ : *smallest_;
+ }
+
+ ParsedInternalKey end_key() const {
+ return (largest_ == nullptr ||
+ icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+ ? iter_->parsed_end_key()
+ : *largest_;
+ }
+
+ SequenceNumber seq() const { return iter_->seq(); }
+
+ std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+ SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+ SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+ SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+ const InternalKeyComparator* icmp_;
+ const ParsedInternalKey* smallest_ = nullptr;
+ const ParsedInternalKey* largest_ = nullptr;
+ std::list<ParsedInternalKey> pinned_bounds_;
+
+ const InternalKey* smallest_ikey_;
+ const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return a->seq() > b->seq();
+ }
+};
+
+struct StartKeyMinComparator {
+ explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return icmp->Compare(a->start_key(), b->start_key()) > 0;
+ }
+
+ const InternalKeyComparator* icmp;
+};
+
+class ForwardRangeDelIterator {
+ public:
+ explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
+
+ bool ShouldDelete(const ParsedInternalKey& parsed);
+ void Invalidate();
+
+ void AddNewIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ iter->Seek(parsed.user_key);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ size_t UnusedIdx() const { return unused_idx_; }
+ void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+ using ActiveSeqSet =
+ std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+ struct EndKeyMinComparator {
+ explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const ActiveSeqSet::const_iterator& a,
+ const ActiveSeqSet::const_iterator& b) const {
+ return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+
+ void PushIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ if (!iter->Valid()) {
+ // The iterator has been fully consumed, so we don't need to add it to
+ // either of the heaps.
+ return;
+ }
+ int cmp = icmp_->Compare(parsed, iter->start_key());
+ if (cmp < 0) {
+ PushInactiveIter(iter);
+ } else {
+ PushActiveIter(iter);
+ }
+ }
+
+ void PushActiveIter(TruncatedRangeDelIterator* iter) {
+ auto seq_pos = active_seqnums_.insert(iter);
+ active_iters_.push(seq_pos);
+ }
+
+ TruncatedRangeDelIterator* PopActiveIter() {
+ auto active_top = active_iters_.top();
+ auto iter = *active_top;
+ active_iters_.pop();
+ active_seqnums_.erase(active_top);
+ return iter;
+ }
+
+ void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+ inactive_iters_.push(iter);
+ }
+
+ TruncatedRangeDelIterator* PopInactiveIter() {
+ auto* iter = inactive_iters_.top();
+ inactive_iters_.pop();
+ return iter;
+ }
+
+ const InternalKeyComparator* icmp_;
+ size_t unused_idx_;
+ ActiveSeqSet active_seqnums_;
+ BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
+ BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> inactive_iters_;
+};
+
+class ReverseRangeDelIterator {
+ public:
+ explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp);
+
+ bool ShouldDelete(const ParsedInternalKey& parsed);
+ void Invalidate();
+
+ void AddNewIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ iter->SeekForPrev(parsed.user_key);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ size_t UnusedIdx() const { return unused_idx_; }
+ void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+ using ActiveSeqSet =
+ std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+ struct EndKeyMaxComparator {
+ explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return icmp->Compare(a->end_key(), b->end_key()) < 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+ struct StartKeyMaxComparator {
+ explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const ActiveSeqSet::const_iterator& a,
+ const ActiveSeqSet::const_iterator& b) const {
+ return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+
+ void PushIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ if (!iter->Valid()) {
+ // The iterator has been fully consumed, so we don't need to add it to
+ // either of the heaps.
+ } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
+ PushInactiveIter(iter);
+ } else {
+ PushActiveIter(iter);
+ }
+ }
+
+ void PushActiveIter(TruncatedRangeDelIterator* iter) {
+ auto seq_pos = active_seqnums_.insert(iter);
+ active_iters_.push(seq_pos);
+ }
+
+ TruncatedRangeDelIterator* PopActiveIter() {
+ auto active_top = active_iters_.top();
+ auto iter = *active_top;
+ active_iters_.pop();
+ active_seqnums_.erase(active_top);
+ return iter;
+ }
+
+ void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+ inactive_iters_.push(iter);
+ }
+
+ TruncatedRangeDelIterator* PopInactiveIter() {
+ auto* iter = inactive_iters_.top();
+ inactive_iters_.pop();
+ return iter;
+ }
+
+ const InternalKeyComparator* icmp_;
+ size_t unused_idx_;
+ ActiveSeqSet active_seqnums_;
+ BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator> active_iters_;
+ BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
+};
+
+enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal };
+class RangeDelAggregator {
+ public:
+ explicit RangeDelAggregator(const InternalKeyComparator* icmp)
+ : icmp_(icmp) {}
+ virtual ~RangeDelAggregator() {}
+
+ virtual void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) = 0;
+
+ bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) {
+ ParsedInternalKey parsed;
+ if (!ParseInternalKey(key, &parsed)) {
+ return false;
+ }
+ return ShouldDelete(parsed, mode);
+ }
+ virtual bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) = 0;
+
+ virtual void InvalidateRangeDelMapPositions() = 0;
+
+ virtual bool IsEmpty() const = 0;
+
+ bool AddFile(uint64_t file_number) {
+ return files_seen_.insert(file_number).second;
+ }
+
+ protected:
+ class StripeRep {
+ public:
+ StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound,
+ SequenceNumber lower_bound)
+ : icmp_(icmp),
+ forward_iter_(icmp),
+ reverse_iter_(icmp),
+ upper_bound_(upper_bound),
+ lower_bound_(lower_bound) {}
+
+ void AddTombstones(std::unique_ptr<TruncatedRangeDelIterator> input_iter) {
+ iters_.push_back(std::move(input_iter));
+ }
+
+ bool IsEmpty() const { return iters_.empty(); }
+
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode);
+
+ void Invalidate() {
+ if (!IsEmpty()) {
+ InvalidateForwardIter();
+ InvalidateReverseIter();
+ }
+ }
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ private:
+ bool InStripe(SequenceNumber seq) const {
+ return lower_bound_ <= seq && seq <= upper_bound_;
+ }
+
+ void InvalidateForwardIter() { forward_iter_.Invalidate(); }
+
+ void InvalidateReverseIter() { reverse_iter_.Invalidate(); }
+
+ const InternalKeyComparator* icmp_;
+ std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
+ ForwardRangeDelIterator forward_iter_;
+ ReverseRangeDelIterator reverse_iter_;
+ SequenceNumber upper_bound_;
+ SequenceNumber lower_bound_;
+ };
+
+ const InternalKeyComparator* icmp_;
+
+ private:
+ std::set<uint64_t> files_seen_;
+};
+
+class ReadRangeDelAggregator final : public RangeDelAggregator {
+ public:
+ ReadRangeDelAggregator(const InternalKeyComparator* icmp,
+ SequenceNumber upper_bound)
+ : RangeDelAggregator(icmp),
+ rep_(icmp, upper_bound, 0 /* lower_bound */) {}
+ ~ReadRangeDelAggregator() override {}
+
+ using RangeDelAggregator::ShouldDelete;
+ void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) override;
+
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) final override {
+ if (rep_.IsEmpty()) {
+ return false;
+ }
+ return ShouldDeleteImpl(parsed, mode);
+ }
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); }
+
+ bool IsEmpty() const override { return rep_.IsEmpty(); }
+
+ private:
+ StripeRep rep_;
+
+ bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode);
+};
+
+class CompactionRangeDelAggregator : public RangeDelAggregator {
+ public:
+ CompactionRangeDelAggregator(const InternalKeyComparator* icmp,
+ const std::vector<SequenceNumber>& snapshots)
+ : RangeDelAggregator(icmp), snapshots_(&snapshots) {}
+ ~CompactionRangeDelAggregator() override {}
+
+ void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) override;
+
+ using RangeDelAggregator::ShouldDelete;
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) override;
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ void InvalidateRangeDelMapPositions() override {
+ for (auto& rep : reps_) {
+ rep.second.Invalidate();
+ }
+ }
+
+ bool IsEmpty() const override {
+ for (const auto& rep : reps_) {
+ if (!rep.second.IsEmpty()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Creates an iterator over all the range tombstones in the aggregator, for
+ // use in compaction. Nullptr arguments indicate that the iterator range is
+ // unbounded.
+ // NOTE: the boundaries are used for optimization purposes to reduce the
+ // number of tombstones that are passed to the fragmenter; they do not
+ // guarantee that the resulting iterator only contains range tombstones that
+ // cover keys in the provided range. If required, these bounds must be
+ // enforced during iteration.
+ std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
+ const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
+ bool upper_bound_inclusive = false);
+
+ private:
+ std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
+ std::map<SequenceNumber, StripeRep> reps_;
+
+ const std::vector<SequenceNumber>* snapshots_;
+};
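+
+// Usage sketch (hypothetical call sites; `icmp`, `snapshots`,
+// `fragmented_tombstone_iter`, and `parsed_key` are placeholders):
+//
+//   CompactionRangeDelAggregator agg(&icmp, snapshots);
+//   agg.AddTombstones(std::move(fragmented_tombstone_iter),
+//                     &file_smallest_key, &file_largest_key);
+//   if (agg.ShouldDelete(parsed_key,
+//                        RangeDelPositioningMode::kForwardTraversal)) {
+//     // Drop the key during compaction.
+//   }
+//   auto tombstone_iter = agg.NewIterator(/*lower_bound=*/nullptr,
+//                                         /*upper_bound=*/nullptr,
+//                                         /*upper_bound_inclusive=*/false);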
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator_bench.cc b/src/rocksdb/db/range_del_aggregator_bench.cc
new file mode 100644
index 000000000..3f3135f2e
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_bench.cc
@@ -0,0 +1,260 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#else
+
+#include <iostream>
+#include <iomanip>
+#include <memory>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created");
+
+DEFINE_int32(num_runs, 1000, "number of test runs");
+
+DEFINE_int32(tombstone_start_upper_bound, 1000,
+ "exclusive upper bound on range tombstone start keys");
+
+DEFINE_int32(should_delete_upper_bound, 1000,
+ "exclusive upper bound on keys passed to ShouldDelete");
+
+DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width");
+
+DEFINE_double(tombstone_width_stddev, 0.0,
+ "standard deviation of range tombstone width");
+
+DEFINE_int32(seed, 0, "random number generator seed");
+
+DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
+
+DEFINE_int32(add_tombstones_per_run, 1,
+ "number of AddTombstones calls per run");
+
+namespace {
+
+struct Stats {
+ uint64_t time_add_tombstones = 0;
+ uint64_t time_first_should_delete = 0;
+ uint64_t time_rest_should_delete = 0;
+};
+
+std::ostream& operator<<(std::ostream& os, const Stats& s) {
+ std::ios fmt_holder(nullptr);
+ fmt_holder.copyfmt(os);
+
+ os << std::left;
+ os << std::setw(25) << "AddTombstones: "
+ << s.time_add_tombstones /
+ (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ os << std::setw(25) << "ShouldDelete (first): "
+ << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n";
+ if (FLAGS_should_deletes_per_run > 1) {
+ os << std::setw(25) << "ShouldDelete (rest): "
+ << s.time_rest_should_delete /
+ ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ }
+
+ os.copyfmt(fmt_holder);
+ return os;
+}
+
+auto icmp = ROCKSDB_NAMESPACE::InternalKeyComparator(
+ ROCKSDB_NAMESPACE::BytewiseComparator());
+
+} // anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// A wrapper around a RangeTombstone and the underlying data of its start and
+// end keys.
+struct PersistentRangeTombstone {
+ std::string start_key;
+ std::string end_key;
+ RangeTombstone tombstone;
+
+ PersistentRangeTombstone(std::string start, std::string end,
+ SequenceNumber seq)
+ : start_key(std::move(start)), end_key(std::move(end)) {
+ tombstone = RangeTombstone(start_key, end_key, seq);
+ }
+
+ PersistentRangeTombstone() = default;
+
+ PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; }
+
+ PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) {
+ start_key = t.start_key;
+ end_key = t.end_key;
+ tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+ return *this;
+ }
+
+ PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = t; }
+
+ PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) {
+ start_key = std::move(t.start_key);
+ end_key = std::move(t.end_key);
+ tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+ return *this;
+ }
+};
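+
+// Note on the copy and move operations above: RangeTombstone only holds
+// Slices, so `tombstone` is rebuilt from the freshly copied strings to keep
+// it pointing at this object's own start_key/end_key storage.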
+
+struct TombstoneStartKeyComparator {
+ explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstone& a, const RangeTombstone& b) const {
+ return cmp->Compare(a.start_key_, b.start_key_) < 0;
+ }
+
+ const Comparator* cmp;
+};
+
+std::unique_ptr<InternalIterator> MakeRangeDelIterator(
+ const std::vector<PersistentRangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.tombstone.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<test::VectorIterator>(
+ new test::VectorIterator(keys, values));
+}
+
+// convert long to a big-endian slice key
+static std::string Key(int64_t val) {
+ std::string little_endian_key;
+ std::string big_endian_key;
+ PutFixed64(&little_endian_key, val);
+ assert(little_endian_key.size() == sizeof(val));
+ big_endian_key.resize(sizeof(val));
+ for (size_t i = 0; i < sizeof(val); ++i) {
+ big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+ }
+ return big_endian_key;
+}
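+
+// Illustrative sketch: Key(5) returns the 8-byte string
+// "\x00\x00\x00\x00\x00\x00\x00\x05", so keys compare in numeric order under
+// the bytewise comparator.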
+
+} // anonymous namespace
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ParseCommandLineFlags(&argc, &argv, true);
+
+ Stats stats;
+ ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed);
+ std::default_random_engine random_gen(FLAGS_seed);
+ std::normal_distribution<double> normal_dist(FLAGS_tombstone_width_mean,
+ FLAGS_tombstone_width_stddev);
+ std::vector<std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone> >
+ all_persistent_range_tombstones(FLAGS_add_tombstones_per_run);
+ for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) {
+ all_persistent_range_tombstones[i] =
+ std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone>(
+ FLAGS_num_range_tombstones);
+ }
+ auto mode = ROCKSDB_NAMESPACE::RangeDelPositioningMode::kForwardTraversal;
+
+ for (int i = 0; i < FLAGS_num_runs; i++) {
+ ROCKSDB_NAMESPACE::ReadRangeDelAggregator range_del_agg(
+ &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */);
+
+ std::vector<
+ std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList> >
+ fragmented_range_tombstone_lists(FLAGS_add_tombstones_per_run);
+
+ for (auto& persistent_range_tombstones : all_persistent_range_tombstones) {
+ // TODO(abhimadan): consider whether creating the range tombstones right
+ // before AddTombstones is artificially warming the cache compared to
+ // real workloads.
+ for (int j = 0; j < FLAGS_num_range_tombstones; j++) {
+ uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound);
+ uint64_t end = static_cast<uint64_t>(
+ std::round(start + std::max(1.0, normal_dist(random_gen))));
+ persistent_range_tombstones[j] =
+ ROCKSDB_NAMESPACE::PersistentRangeTombstone(
+ ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j);
+ }
+
+ auto range_del_iter =
+ ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones);
+ fragmented_range_tombstone_lists.emplace_back(
+ new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList(
+ ROCKSDB_NAMESPACE::MakeRangeDelIterator(
+ persistent_range_tombstones),
+ icmp));
+ std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator>
+ fragmented_range_del_iter(
+ new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator(
+ fragmented_range_tombstone_lists.back().get(), icmp,
+ ROCKSDB_NAMESPACE::kMaxSequenceNumber));
+
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones(
+ ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */);
+ range_del_agg.AddTombstones(std::move(fragmented_range_del_iter));
+ stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
+ }
+
+ ROCKSDB_NAMESPACE::ParsedInternalKey parsed_key;
+ parsed_key.sequence = FLAGS_num_range_tombstones / 2;
+ parsed_key.type = ROCKSDB_NAMESPACE::kTypeValue;
+
+ uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound -
+ FLAGS_should_deletes_per_run + 1);
+
+ for (int j = 0; j < FLAGS_should_deletes_per_run; j++) {
+ std::string key_string = ROCKSDB_NAMESPACE::Key(first_key + j);
+ parsed_key.user_key = key_string;
+
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete(
+ ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */);
+ range_del_agg.ShouldDelete(parsed_key, mode);
+ uint64_t call_time = stop_watch_should_delete.ElapsedNanos();
+
+ if (j == 0) {
+ stats.time_first_should_delete += call_time;
+ } else {
+ stats.time_rest_should_delete += call_time;
+ }
+ }
+ }
+
+ std::cout << "=========================\n"
+ << "Results:\n"
+ << "=========================\n"
+ << stats;
+
+ return 0;
+}
+
+#endif // GFLAGS
diff --git a/src/rocksdb/db/range_del_aggregator_test.cc b/src/rocksdb/db/range_del_aggregator_test.cc
new file mode 100644
index 000000000..0b8b5079c
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_test.cc
@@ -0,0 +1,709 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeDelAggregatorTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+ const std::vector<RangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<test::VectorIterator>(
+ new test::VectorIterator(keys, values));
+}
+
+std::vector<std::unique_ptr<FragmentedRangeTombstoneList>>
+MakeFragmentedTombstoneLists(
+ const std::vector<std::vector<RangeTombstone>>& range_dels_list) {
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneList>> fragment_lists;
+ for (const auto& range_dels : range_dels_list) {
+ auto range_del_iter = MakeRangeDelIter(range_dels);
+ fragment_lists.emplace_back(new FragmentedRangeTombstoneList(
+ std::move(range_del_iter), bytewise_icmp));
+ }
+ return fragment_lists;
+}
+
+struct TruncatedIterScanTestCase {
+ ParsedInternalKey start;
+ ParsedInternalKey end;
+ SequenceNumber seq;
+};
+
+struct TruncatedIterSeekTestCase {
+ Slice target;
+ ParsedInternalKey start;
+ ParsedInternalKey end;
+ SequenceNumber seq;
+ bool invalid;
+};
+
+struct ShouldDeleteTestCase {
+ ParsedInternalKey lookup_key;
+ bool result;
+};
+
+struct IsRangeOverlappedTestCase {
+ Slice start;
+ Slice end;
+ bool result;
+};
+
+ParsedInternalKey UncutEndpoint(const Slice& s) {
+ return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion);
+}
+
+ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq) {
+ return ParsedInternalKey(key, seq, kTypeValue);
+}
+
+void VerifyIterator(
+ TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterScanTestCase>& expected_range_dels) {
+ // Test forward iteration.
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
+ EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
+ }
+ EXPECT_FALSE(iter->Valid());
+
+ // Test reverse iteration.
+ iter->SeekToLast();
+ std::vector<TruncatedIterScanTestCase> reverse_expected_range_dels(
+ expected_range_dels.rbegin(), expected_range_dels.rend());
+ for (size_t i = 0; i < reverse_expected_range_dels.size();
+ i++, iter->Prev()) {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(),
+ reverse_expected_range_dels[i].start));
+ EXPECT_EQ(
+ 0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end));
+ EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq());
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+void VerifySeek(TruncatedRangeDelIterator* iter,
+ const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ iter->Seek(test_case.target);
+ if (test_case.invalid) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+ EXPECT_EQ(test_case.seq, iter->seq());
+ }
+ }
+}
+
+void VerifySeekForPrev(
+ TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ iter->SeekForPrev(test_case.target);
+ if (test_case.invalid) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+ EXPECT_EQ(test_case.seq, iter->seq());
+ }
+ }
+}
+
+void VerifyShouldDelete(RangeDelAggregator* range_del_agg,
+ const std::vector<ShouldDeleteTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ EXPECT_EQ(
+ test_case.result,
+ range_del_agg->ShouldDelete(
+ test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal));
+ }
+ for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) {
+ const auto& test_case = *it;
+ EXPECT_EQ(
+ test_case.result,
+ range_del_agg->ShouldDelete(
+ test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal));
+ }
+}
+
+void VerifyIsRangeOverlapped(
+ ReadRangeDelAggregator* range_del_agg,
+ const std::vector<IsRangeOverlappedTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ EXPECT_EQ(test_case.result,
+ range_del_agg->IsRangeOverlapped(test_case.start, test_case.end));
+ }
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+ const FragmentedRangeTombstoneIterator* iter) {
+ // Test InternalIterator interface.
+ EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+ EXPECT_EQ(tombstone.end_key_, iter->value());
+ EXPECT_EQ(tombstone.seq_, iter->seq());
+
+ // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key());
+ EXPECT_EQ(tombstone.end_key_, iter->end_key());
+ EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+} // namespace
+
+TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) {
+ auto range_del_iter = MakeRangeDelIter({});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ iter.SeekToFirst();
+ ASSERT_FALSE(iter.Valid());
+
+ iter.SeekToLast();
+ ASSERT_FALSE(iter.Valid());
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIter) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ 9 /* snapshot */));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ InternalKey smallest("d", 7, kTypeValue);
+ InternalKey largest("m", 9, kTypeValue);
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+ &smallest, &largest);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), InternalValue("m", 8), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), InternalValue("m", 8), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", InternalValue("d", 7), UncutEndpoint("e"), 10}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), InternalValue("m", 8), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ InternalKey smallest("f", 7, kTypeValue);
+ InternalKey largest("i", 9, kTypeValue);
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+ &smallest, &largest);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{InternalValue("f", 7), UncutEndpoint("g"), 8}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}});
+}
+
+TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+ range_del_agg.AddTombstones(std::move(input_iter));
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+ {InternalValue("b", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), true},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", true},
+ {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+ {InternalValue("a", 9), true},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), false},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", true},
+ {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+ std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+ {InternalKey("a", 4, kTypeValue),
+ InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("m", 20, kTypeValue),
+ InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+ for (size_t i = 0; i < fragment_lists.size(); i++) {
+ const auto& fragment_list = fragment_lists[i];
+ const auto& bounds = iter_bounds[i];
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter), &bounds.first,
+ &bounds.second);
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+ {InternalValue("a", 9), false},
+ {InternalValue("a", 4), true},
+ {InternalValue("m", 10), false},
+ {InternalValue("m", 9), true},
+ {InternalValue("x", 10), false},
+ {InternalValue("x", 9), false},
+ {InternalValue("x", 5), true},
+ {InternalValue("z", 9), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "n", true},
+ {"l", "x", true},
+ {"w", "z", true},
+ {"zzz", "zz", false},
+ {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregatorSameLevel) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+ std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+ {InternalKey("a", 4, kTypeValue),
+ InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("m", 20, kTypeValue),
+ InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+
+ auto add_iter_to_agg = [&](size_t i) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_lists[i].get(),
+ bytewise_icmp, 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first,
+ &iter_bounds[i].second);
+ };
+
+ add_iter_to_agg(0);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+ {InternalValue("a", 9), false},
+ {InternalValue("a", 4), true}});
+
+ add_iter_to_agg(1);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false},
+ {InternalValue("m", 9), true}});
+
+ add_iter_to_agg(2);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false},
+ {InternalValue("x", 9), false},
+ {InternalValue("x", 5), true},
+ {InternalValue("z", 9), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "n", true},
+ {"l", "x", true},
+ {"w", "z", true},
+ {"zzz", "zz", false},
+ {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots;
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+ {InternalValue("b", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), true},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ auto range_del_compaction_iter = range_del_agg.NewIterator();
+ VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+ {"b", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"h", "i", 25},
+ {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(
+ &range_del_agg,
+ {
+ {InternalValue("a", 19), false}, // [10, 19]
+ {InternalValue("a", 9), false}, // [0, 9]
+ {InternalValue("b", 9), false}, // [0, 9]
+ {InternalValue("d", 9), false}, // [0, 9]
+ {InternalValue("d", 7), true}, // [0, 9]
+ {InternalValue("e", 7), true}, // [0, 9]
+ {InternalValue("g", 7), false}, // [0, 9]
+ {InternalValue("h", 24), true}, // [20, kMaxSequenceNumber]
+ {InternalValue("i", 24), false}, // [20, kMaxSequenceNumber]
+ {InternalValue("ii", 14), true}, // [10, 19]
+ {InternalValue("j", 14), false} // [10, 19]
+ });
+
+ auto range_del_compaction_iter = range_del_agg.NewIterator();
+ VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+ {"a", "b", 10},
+ {"b", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"e", "g", 8},
+ {"h", "i", 25},
+ {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+  Slice start("_");
+  Slice end("__");
+  // All of the tombstones start after ["_", "__"], so both the exclusive- and
+  // inclusive-end-key iterators should be empty (mirroring the
+  // CompactionAggregatorEmptyIteratorRight test below).
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("p");
+ Slice end("q");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("bb");
+ Slice end("e");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(),
+ {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(
+ range_del_compaction_iter2.get(),
+ {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}});
+}
+
+TEST_F(RangeDelAggregatorTest,
+ CompactionAggregatorBoundedIteratorExtraFragments) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "d", 10}, {"c", "g", 8}},
+ {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("bb");
+ Slice end("e");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10},
+ {"b", "c", 20},
+ {"b", "c", 10},
+ {"c", "d", 10},
+ {"c", "d", 8},
+ {"d", "f", 30},
+ {"d", "f", 8},
+ {"f", "g", 8}});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10},
+ {"b", "c", 20},
+ {"b", "c", 10},
+ {"c", "d", 10},
+ {"c", "d", 8},
+ {"d", "f", 30},
+ {"d", "f", 8},
+ {"f", "g", 8}});
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.cc b/src/rocksdb/db/range_tombstone_fragmenter.cc
new file mode 100644
index 000000000..58426248c
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.cc
@@ -0,0 +1,439 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include <algorithm>
+#include <functional>
+#include <set>
+
+#include <stdio.h>
+#include <cinttypes>
+
+#include "util/autovector.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots) {
+ if (unfragmented_tombstones == nullptr) {
+ return;
+ }
+ bool is_sorted = true;
+ int num_tombstones = 0;
+ InternalKey pinned_last_start_key;
+ Slice last_start_key;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next(), num_tombstones++) {
+ if (num_tombstones > 0 &&
+ icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+ is_sorted = false;
+ break;
+ }
+ if (unfragmented_tombstones->IsKeyPinned()) {
+ last_start_key = unfragmented_tombstones->key();
+ } else {
+ pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
+ last_start_key = pinned_last_start_key.Encode();
+ }
+ }
+ if (is_sorted) {
+ FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
+ snapshots);
+ return;
+ }
+
+ // Sort the tombstones before fragmenting them.
+ std::vector<std::string> keys, values;
+ keys.reserve(num_tombstones);
+ values.reserve(num_tombstones);
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next()) {
+ keys.emplace_back(unfragmented_tombstones->key().data(),
+ unfragmented_tombstones->key().size());
+ values.emplace_back(unfragmented_tombstones->value().data(),
+ unfragmented_tombstones->value().size());
+ }
+ // VectorIterator implicitly sorts by key during construction.
+ auto iter = std::unique_ptr<VectorIterator>(
+ new VectorIterator(std::move(keys), std::move(values), &icmp));
+ FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
+}
+
+void FragmentedRangeTombstoneList::FragmentTombstones(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots) {
+ Slice cur_start_key(nullptr, 0);
+ auto cmp = ParsedInternalKeyComparator(&icmp);
+
+ // Stores the end keys and sequence numbers of range tombstones with a start
+ // key less than or equal to cur_start_key. Provides an ordering by end key
+ // for use in flush_current_tombstones.
+ std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
+
+ // Given the next start key in unfragmented_tombstones,
+ // flush_current_tombstones writes every tombstone fragment that starts
+ // and ends with a key before next_start_key, and starts with a key greater
+ // than or equal to cur_start_key.
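+  // For example, given input tombstones [a, e) @ 10 and [c, g) @ 8: when the
+  // start key advances from "a" to "c", the fragment [a, c) @ {10} is
+  // flushed; when the final end key "g" is flushed, the fragments
+  // [c, e) @ {10, 8} and [e, g) @ {8} follow.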
+ auto flush_current_tombstones = [&](const Slice& next_start_key) {
+ auto it = cur_end_keys.begin();
+ bool reached_next_start_key = false;
+ for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
+ Slice cur_end_key = it->user_key;
+ if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
+ // Empty tombstone.
+ continue;
+ }
+ if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
+ // All of the end keys in [it, cur_end_keys.end()) are after
+ // next_start_key, so the tombstones they represent can be used in
+ // fragments that start with keys greater than or equal to
+ // next_start_key. However, the end keys we already passed will not be
+ // used in any more tombstone fragments.
+ //
+ // Remove the fully fragmented tombstones and stop iteration after a
+ // final round of flushing to preserve the tombstones we can create more
+ // fragments from.
+ reached_next_start_key = true;
+ cur_end_keys.erase(cur_end_keys.begin(), it);
+ cur_end_key = next_start_key;
+ }
+
+ // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
+ // should not overlap with the last-flushed tombstone fragment.
+ assert(tombstones_.empty() ||
+ icmp.user_comparator()->Compare(tombstones_.back().end_key,
+ cur_start_key) <= 0);
+
+ // Sort the sequence numbers of the tombstones being fragmented in
+ // descending order, and then flush them in that order.
+ autovector<SequenceNumber> seqnums_to_flush;
+ for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+ seqnums_to_flush.push_back(flush_it->sequence);
+ }
+ std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+ std::greater<SequenceNumber>());
+
+ size_t start_idx = tombstone_seqs_.size();
+ size_t end_idx = start_idx + seqnums_to_flush.size();
+
+ if (for_compaction) {
+ // Drop all tombstone seqnums that are not preserved by a snapshot.
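+        // For example, with snapshots {9, 20} and a fragment whose seqnums
+        // are {10, 8, 6}: 10 is kept (the newest seqnum visible to snapshot
+        // 20), 8 is kept (the newest visible to snapshot 9), and 6 is dropped
+        // because no snapshot sees it as its newest covering tombstone.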
+ SequenceNumber next_snapshot = kMaxSequenceNumber;
+ for (auto seq : seqnums_to_flush) {
+ if (seq <= next_snapshot) {
+            // This seqnum is visible to a lower snapshot.
+ tombstone_seqs_.push_back(seq);
+ seq_set_.insert(seq);
+ auto upper_bound_it =
+ std::lower_bound(snapshots.begin(), snapshots.end(), seq);
+ if (upper_bound_it == snapshots.begin()) {
+ // This seqnum is the topmost one visible by the earliest
+ // snapshot. None of the seqnums below it will be visible, so we
+ // can skip them.
+ break;
+ }
+ next_snapshot = *std::prev(upper_bound_it);
+ }
+ }
+ end_idx = tombstone_seqs_.size();
+ } else {
+ // The fragmentation is being done for reads, so preserve all seqnums.
+ tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
+ seqnums_to_flush.end());
+ seq_set_.insert(seqnums_to_flush.begin(), seqnums_to_flush.end());
+ }
+
+ assert(start_idx < end_idx);
+ tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, end_idx);
+
+ cur_start_key = cur_end_key;
+ }
+ if (!reached_next_start_key) {
+ // There is a gap between the last flushed tombstone fragment and
+ // the next tombstone's start key. Remove all the end keys in
+ // the working set, since we have fully fragmented their corresponding
+ // tombstones.
+ cur_end_keys.clear();
+ }
+ cur_start_key = next_start_key;
+ };
+
+ pinned_iters_mgr_.StartPinning();
+
+ bool no_tombstones = true;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next()) {
+ const Slice& ikey = unfragmented_tombstones->key();
+ Slice tombstone_start_key = ExtractUserKey(ikey);
+ SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
+ if (!unfragmented_tombstones->IsKeyPinned()) {
+ pinned_slices_.emplace_back(tombstone_start_key.data(),
+ tombstone_start_key.size());
+ tombstone_start_key = pinned_slices_.back();
+ }
+ no_tombstones = false;
+
+ Slice tombstone_end_key = unfragmented_tombstones->value();
+ if (!unfragmented_tombstones->IsValuePinned()) {
+ pinned_slices_.emplace_back(tombstone_end_key.data(),
+ tombstone_end_key.size());
+ tombstone_end_key = pinned_slices_.back();
+ }
+ if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
+ cur_start_key, tombstone_start_key) != 0) {
+ // The start key has changed. Flush all tombstones that start before
+ // this new start key.
+ flush_current_tombstones(tombstone_start_key);
+ }
+ cur_start_key = tombstone_start_key;
+
+ cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
+ }
+ if (!cur_end_keys.empty()) {
+ ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
+ flush_current_tombstones(last_end_key.user_key);
+ }
+
+ if (!no_tombstones) {
+ pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
+ false /* arena */);
+ }
+}
+
+bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
+ SequenceNumber upper) const {
+ auto seq_it = seq_set_.lower_bound(lower);
+ return seq_it != seq_set_.end() && *seq_it <= upper;
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ const FragmentedRangeTombstoneList* tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+ SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_(tombstones),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound) {
+ assert(tombstones_ != nullptr);
+ Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+ SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_ref_(tombstones),
+ tombstones_(tombstones_ref_.get()),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound) {
+ assert(tombstones_ != nullptr);
+ Invalidate();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToFirst() {
+ pos_ = tombstones_->begin();
+ seq_pos_ = tombstones_->seq_begin();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = tombstones_->begin();
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToLast() {
+ pos_ = std::prev(tombstones_->end());
+ seq_pos_ = std::prev(tombstones_->seq_end());
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopLast() {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = std::prev(tombstones_->end());
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ SeekToCoveringTombstone(target);
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ SeekForPrevToCoveringTombstone(target);
+ ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
+ const Slice& target) {
+ pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+ tombstone_end_cmp_);
+ if (pos_ == tombstones_->end()) {
+ // All tombstones end before target.
+ seq_pos_ = tombstones_->seq_end();
+ return;
+ }
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
+ const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+ tombstone_start_cmp_);
+ if (pos_ == tombstones_->begin()) {
+ // All tombstones start after target.
+ Invalidate();
+ return;
+ }
+ --pos_;
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+}
+
+void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
+ while (pos_ != tombstones_->end() &&
+ (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+ *seq_pos_ < lower_bound_)) {
+ ++pos_;
+ if (pos_ == tombstones_->end()) {
+ Invalidate();
+ return;
+ }
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ }
+}
+
+void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
+ while (pos_ != tombstones_->end() &&
+ (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+ *seq_pos_ < lower_bound_)) {
+ if (pos_ == tombstones_->begin()) {
+ Invalidate();
+ return;
+ }
+ --pos_;
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ }
+}
+
+void FragmentedRangeTombstoneIterator::Next() {
+ ++seq_pos_;
+ if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
+ ++pos_;
+ }
+}
+
+void FragmentedRangeTombstoneIterator::TopNext() {
+ ++pos_;
+ if (pos_ == tombstones_->end()) {
+ return;
+ }
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Prev() {
+ if (seq_pos_ == tombstones_->seq_begin()) {
+ Invalidate();
+ return;
+ }
+ --seq_pos_;
+ if (pos_ == tombstones_->end() ||
+ seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
+ --pos_;
+ }
+}
+
+void FragmentedRangeTombstoneIterator::TopPrev() {
+ if (pos_ == tombstones_->begin()) {
+ Invalidate();
+ return;
+ }
+ --pos_;
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ ScanBackwardToVisibleTombstone();
+}
+
+bool FragmentedRangeTombstoneIterator::Valid() const {
+ return tombstones_ != nullptr && pos_ != tombstones_->end();
+}
+
+SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
+ const Slice& target_user_key) {
+ SeekToCoveringTombstone(target_user_key);
+ return ValidPos() && ucmp_->Compare(start_key(), target_user_key) <= 0 ? seq()
+ : 0;
+}
+
+std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+FragmentedRangeTombstoneIterator::SplitBySnapshot(
+ const std::vector<SequenceNumber>& snapshots) {
+ std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ splits;
+ SequenceNumber lower = 0;
+ SequenceNumber upper;
+ for (size_t i = 0; i <= snapshots.size(); i++) {
+ if (i >= snapshots.size()) {
+ upper = kMaxSequenceNumber;
+ } else {
+ upper = snapshots[i];
+ }
+ if (tombstones_->ContainsRange(lower, upper)) {
+ splits.emplace(upper, std::unique_ptr<FragmentedRangeTombstoneIterator>(
+ new FragmentedRangeTombstoneIterator(
+ tombstones_, *icmp_, upper, lower)));
+ }
+ lower = upper + 1;
+ }
+ return splits;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.h b/src/rocksdb/db/range_tombstone_fragmenter.h
new file mode 100644
index 000000000..63ec24e64
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.h
@@ -0,0 +1,256 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FragmentedRangeTombstoneList {
+ public:
+ // A compact representation of a "stack" of range tombstone fragments, which
+ // start and end at the same user keys but have different sequence numbers.
+ // The members seq_start_idx and seq_end_idx are intended to be parameters to
+ // seq_iter().
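+  // For example, the overlapping tombstones [a, e) @ 10 and [c, g) @ 8
+  // fragment into the stacks [a, c) -> {10}, [c, e) -> {10, 8}, and
+  // [e, g) -> {8}.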
+ struct RangeTombstoneStack {
+ RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
+ size_t end_idx)
+ : start_key(start),
+ end_key(end),
+ seq_start_idx(start_idx),
+ seq_end_idx(end_idx) {}
+
+ Slice start_key;
+ Slice end_key;
+ size_t seq_start_idx;
+ size_t seq_end_idx;
+ };
+ FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction = false,
+ const std::vector<SequenceNumber>& snapshots = {});
+
+ std::vector<RangeTombstoneStack>::const_iterator begin() const {
+ return tombstones_.begin();
+ }
+
+ std::vector<RangeTombstoneStack>::const_iterator end() const {
+ return tombstones_.end();
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+ return std::next(tombstone_seqs_.begin(), idx);
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_begin() const {
+ return tombstone_seqs_.begin();
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_end() const {
+ return tombstone_seqs_.end();
+ }
+
+ bool empty() const { return tombstones_.empty(); }
+
+  // Returns true if the stored tombstones contain one with a sequence
+  // number in [lower, upper].
+ bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const;
+
+ private:
+ // Given an ordered range tombstone iterator unfragmented_tombstones,
+ // "fragment" the tombstones into non-overlapping pieces, and store them in
+ // tombstones_ and tombstone_seqs_.
+ void FragmentTombstones(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots);
+
+ std::vector<RangeTombstoneStack> tombstones_;
+ std::vector<SequenceNumber> tombstone_seqs_;
+ std::set<SequenceNumber> seq_set_;
+ std::list<std::string> pinned_slices_;
+ PinnedIteratorsManager pinned_iters_mgr_;
+};
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the RangeDelAggregator
+// tombstone collapsing is always O(n log n).
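+//
+// Example usage (a sketch following range_tombstone_fragmenter_test.cc; the
+// iterator, comparator, and key names are placeholders):
+//
+//   FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+//                                              icmp);
+//   FragmentedRangeTombstoneIterator iter(&fragment_list, icmp,
+//                                         snapshot /* upper_bound */);
+//   SequenceNumber seq = iter.MaxCoveringTombstoneSeqnum(user_key);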
+class FragmentedRangeTombstoneIterator : public InternalIterator {
+ public:
+ FragmentedRangeTombstoneIterator(
+ const FragmentedRangeTombstoneList* tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+ SequenceNumber lower_bound = 0);
+ FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+ SequenceNumber lower_bound = 0);
+
+ void SeekToFirst() override;
+ void SeekToLast() override;
+
+ void SeekToTopFirst();
+ void SeekToTopLast();
+
+ // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
+ // seeking should behave. This is OK because they are not currently used, but
+ // eventually FragmentedRangeTombstoneIterator should no longer implement
+ // InternalIterator.
+ //
+ // Seeks to the range tombstone that covers target at a seqnum in the
+ // snapshot. If no such tombstone exists, seek to the earliest tombstone in
+ // the snapshot that ends after target.
+ void Seek(const Slice& target) override;
+ // Seeks to the range tombstone that covers target at a seqnum in the
+ // snapshot. If no such tombstone exists, seek to the latest tombstone in the
+ // snapshot that starts before target.
+ void SeekForPrev(const Slice& target) override;
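+  //
+  // For example, in the SeekCovered test in range_tombstone_fragmenter_test.cc,
+  // Seek("f") and SeekForPrev("f") with upper_bound == kMaxSequenceNumber both
+  // position the iterator at the fragment [e, g) @ 8, the newest visible
+  // tombstone covering "f".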
+
+ void Next() override;
+ void Prev() override;
+
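+  // Unlike Next()/Prev(), which step through the individual seqnums within a
+  // fragment, TopNext()/TopPrev() move to the next/previous fragment that has
+  // a visible seqnum and position at its newest seqnum not exceeding
+  // upper_bound.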
+ void TopNext();
+ void TopPrev();
+
+ bool Valid() const override;
+ Slice key() const override {
+ MaybePinKey();
+ return current_start_key_.Encode();
+ }
+ Slice value() const override { return pos_->end_key; }
+ bool IsKeyPinned() const override { return false; }
+ bool IsValuePinned() const override { return true; }
+ Status status() const override { return Status::OK(); }
+
+ bool empty() const { return tombstones_->empty(); }
+ void Invalidate() {
+ pos_ = tombstones_->end();
+ seq_pos_ = tombstones_->seq_end();
+ pinned_pos_ = tombstones_->end();
+ pinned_seq_pos_ = tombstones_->seq_end();
+ }
+
+ RangeTombstone Tombstone() const {
+ return RangeTombstone(start_key(), end_key(), seq());
+ }
+ Slice start_key() const { return pos_->start_key; }
+ Slice end_key() const { return pos_->end_key; }
+ SequenceNumber seq() const { return *seq_pos_; }
+ ParsedInternalKey parsed_start_key() const {
+ return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ }
+ ParsedInternalKey parsed_end_key() const {
+ return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ }
+
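+  // Returns the seqnum of the newest range tombstone covering user_key whose
+  // seqnum does not exceed this iterator's upper bound, or 0 if no such
+  // tombstone covers user_key.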
+ SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
+
+ // Splits the iterator into n+1 iterators (where n is the number of
+ // snapshots), each providing a view over a "stripe" of sequence numbers. The
+ // iterators are keyed by the upper bound of their ranges (the provided
+ // snapshots + kMaxSequenceNumber).
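+  // For example, splitting with snapshots {3, 5, 7, 9} produces iterators
+  // keyed by 3, 5, 7, 9, and kMaxSequenceNumber for the seqnum stripes
+  // [0, 3], [4, 5], [6, 7], [8, 9], and [10, kMaxSequenceNumber]; stripes
+  // that contain no tombstones are omitted.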
+ //
+ // NOTE: the iterators in the returned map are no longer valid if their
+ // parent iterator is deleted, since they do not modify the refcount of the
+ // underlying tombstone list. Therefore, this map should be deleted before
+ // the parent iterator.
+ std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+ SequenceNumber upper_bound() const { return upper_bound_; }
+ SequenceNumber lower_bound() const { return lower_bound_; }
+
+ private:
+ using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
+
+ struct RangeTombstoneStackStartComparator {
+ explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstoneStack& a,
+ const RangeTombstoneStack& b) const {
+ return cmp->Compare(a.start_key, b.start_key) < 0;
+ }
+
+ bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+ return cmp->Compare(a.start_key, b) < 0;
+ }
+
+ bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+ return cmp->Compare(a, b.start_key) < 0;
+ }
+
+ const Comparator* cmp;
+ };
+
+ struct RangeTombstoneStackEndComparator {
+ explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstoneStack& a,
+ const RangeTombstoneStack& b) const {
+ return cmp->Compare(a.end_key, b.end_key) < 0;
+ }
+
+ bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+ return cmp->Compare(a.end_key, b) < 0;
+ }
+
+ bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+ return cmp->Compare(a, b.end_key) < 0;
+ }
+
+ const Comparator* cmp;
+ };
+
+ void MaybePinKey() const {
+ if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
+ (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
+ current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
+ pinned_pos_ = pos_;
+ pinned_seq_pos_ = seq_pos_;
+ }
+ }
+
+ void SeekToCoveringTombstone(const Slice& key);
+ void SeekForPrevToCoveringTombstone(const Slice& key);
+ void ScanForwardToVisibleTombstone();
+ void ScanBackwardToVisibleTombstone();
+ bool ValidPos() const {
+ return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
+ }
+
+ const RangeTombstoneStackStartComparator tombstone_start_cmp_;
+ const RangeTombstoneStackEndComparator tombstone_end_cmp_;
+ const InternalKeyComparator* icmp_;
+ const Comparator* ucmp_;
+ std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
+ const FragmentedRangeTombstoneList* tombstones_;
+ SequenceNumber upper_bound_;
+ SequenceNumber lower_bound_;
+ std::vector<RangeTombstoneStack>::const_iterator pos_;
+ std::vector<SequenceNumber>::const_iterator seq_pos_;
+ mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+ mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+ mutable InternalKey current_start_key_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter_test.cc b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
new file mode 100644
index 000000000..56234b1dd
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
@@ -0,0 +1,552 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include "db/db_test_util.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTombstoneFragmenterTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+ const std::vector<RangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<test::VectorIterator>(
+ new test::VectorIterator(keys, values));
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+ const FragmentedRangeTombstoneIterator* iter) {
+ // Test InternalIterator interface.
+ EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+ EXPECT_EQ(tombstone.end_key_, iter->value());
+ EXPECT_EQ(tombstone.seq_, iter->seq());
+
+ // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key());
+ EXPECT_EQ(tombstone.end_key_, iter->end_key());
+ EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+void VerifyVisibleTombstones(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToTopFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+struct SeekTestCase {
+ Slice seek_target;
+ RangeTombstone expected_position;
+ bool out_of_range;
+};
+
+void VerifySeek(FragmentedRangeTombstoneIterator* iter,
+ const std::vector<SeekTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ iter->Seek(testcase.seek_target);
+ if (testcase.out_of_range) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(testcase.expected_position, iter);
+ }
+ }
+}
+
+void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter,
+ const std::vector<SeekTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ iter->SeekForPrev(testcase.seek_target);
+ if (testcase.out_of_range) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(testcase.expected_position, iter);
+ }
+ }
+}
+
+struct MaxCoveringTombstoneSeqnumTestCase {
+ Slice user_key;
+ SequenceNumber result;
+};
+
+void VerifyMaxCoveringTombstoneSeqnum(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<MaxCoveringTombstoneSeqnumTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ EXPECT_EQ(testcase.result,
+ iter->MaxCoveringTombstoneSeqnum(testcase.user_key));
+ }
+}
+
+} // anonymous namespace
+
+TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(
+ &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 15}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
+ auto range_del_iter = MakeRangeDelIter(
+ {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(
+ &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 20}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter,
+ {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"a", "c", 7},
+ {"a", "c", 3},
+ {"c", "e", 10},
+ {"c", "e", 7},
+ {"e", "g", 7}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 10}, {"e", 7}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "c", 30},
+ {"a", "g", 20},
+ {"a", "e", 10},
+ {"a", "g", 7},
+ {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 30},
+ {"a", "c", 20},
+ {"a", "c", 10},
+ {"a", "c", 7},
+ {"a", "c", 3},
+ {"c", "e", 20},
+ {"c", "e", 10},
+ {"c", "e", 7},
+ {"e", "g", 20},
+ {"e", "g", 7}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 30}, {"c", 20}, {"e", 20}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 9 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp,
+ 7 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp,
+ 5 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) {
+ VerifyFragmentedRangeDels(iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"c", "e", 6},
+ {"e", "g", 8},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"j", "l", 2},
+ {"l", "n", 4}});
+ }
+
+ ASSERT_EQ(0, iter1.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound());
+ VerifyVisibleTombstones(&iter1, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter2.lower_bound());
+ ASSERT_EQ(9, iter2.upper_bound());
+ VerifyVisibleTombstones(&iter2, {{"c", "e", 8},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter3.lower_bound());
+ ASSERT_EQ(7, iter3.upper_bound());
+ VerifyVisibleTombstones(&iter3, {{"c", "e", 6},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter4.lower_bound());
+ ASSERT_EQ(5, iter4.upper_bound());
+ VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter5.lower_bound());
+ ASSERT_EQ(3, iter5.upper_bound());
+ VerifyVisibleTombstones(&iter5, {{"j", "l", 2}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ 9 /* upper_bound */);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(9, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"c", "e", 6},
+ {"e", "g", 8},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"j", "l", 2},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(
+ std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+ {} /* snapshots */);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest,
+ OverlapAndRepeatedStartKeyForCompactionWithSnapshot) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(
+ std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+      {20, 9} /* snapshots */);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ auto split_iters = iter.SplitBySnapshot({} /* snapshots */);
+ ASSERT_EQ(1, split_iters.size());
+
+ auto* split_iter = split_iters[kMaxSequenceNumber].get();
+ ASSERT_EQ(0, split_iter->lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound());
+ VerifyVisibleTombstones(split_iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */);
+ ASSERT_EQ(5, split_iters.size());
+
+ auto* split_iter1 = split_iters[3].get();
+ ASSERT_EQ(0, split_iter1->lower_bound());
+ ASSERT_EQ(3, split_iter1->upper_bound());
+ VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}});
+
+ auto* split_iter2 = split_iters[5].get();
+ ASSERT_EQ(4, split_iter2->lower_bound());
+ ASSERT_EQ(5, split_iter2->upper_bound());
+ VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}});
+
+ auto* split_iter3 = split_iters[7].get();
+ ASSERT_EQ(6, split_iter3->lower_bound());
+ ASSERT_EQ(7, split_iter3->upper_bound());
+ VerifyVisibleTombstones(split_iter3,
+ {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}});
+
+ auto* split_iter4 = split_iters[9].get();
+ ASSERT_EQ(8, split_iter4->lower_bound());
+ ASSERT_EQ(9, split_iter4->upper_bound());
+ VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}});
+
+ auto* split_iter5 = split_iters[kMaxSequenceNumber].get();
+ ASSERT_EQ(10, split_iter5->lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound());
+ VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(
+ &iter1,
+ {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+ VerifySeekForPrev(
+ &iter1,
+ {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"a", {"j", "l", 2}},
+ {"e", {"j", "l", 2}},
+ {"l", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"a", {}, true /* out of range */},
+ {"e", {}, true /* out of range */},
+ {"l", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekCovered) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(
+ &iter1,
+ {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+ VerifySeekForPrev(
+ &iter1,
+ {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"b", {"j", "l", 2}},
+ {"f", {"j", "l", 2}},
+ {"m", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"b", {}, true /* out of range */},
+ {"f", {}, true /* out of range */},
+ {"m", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(&iter1, {{"c", {"c", "e", 10}},
+ {"g", {"g", "i", 6}},
+ {"i", {"j", "l", 4}},
+ {"n", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter1, {{"c", {"c", "e", 10}},
+ {"g", {"g", "i", 6}},
+ {"i", {"g", "i", 6}},
+ {"n", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"c", {"j", "l", 2}},
+ {"g", {"j", "l", 2}},
+ {"i", {"j", "l", 2}},
+ {"n", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"c", {}, true /* out of range */},
+ {"g", {}, true /* out of range */},
+ {"i", {}, true /* out of range */},
+ {"n", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter,
+ {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/read_callback.h b/src/rocksdb/db/read_callback.h
new file mode 100644
index 000000000..fbef1dd0d
--- /dev/null
+++ b/src/rocksdb/db/read_callback.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ReadCallback {
+ public:
+ ReadCallback(SequenceNumber last_visible_seq)
+ : max_visible_seq_(last_visible_seq) {}
+ ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
+ : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {}
+
+ virtual ~ReadCallback() {}
+
+  // Will be called to see if the seq number is visible; if not, the caller
+  // moves on to the next seq number.
+ virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0;
+
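+  // For example, with max_visible_seq_ == 10 and min_uncommitted_ == 5, a seq
+  // of 3 is visible without a full check, 12 is not visible, and 7 falls
+  // through to IsVisibleFullCheck().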
+ inline bool IsVisible(SequenceNumber seq) {
+ assert(min_uncommitted_ > 0);
+ assert(min_uncommitted_ >= kMinUnCommittedSeq);
+ if (seq < min_uncommitted_) { // handles seq == 0 as well
+ assert(seq <= max_visible_seq_);
+ return true;
+ } else if (max_visible_seq_ < seq) {
+ assert(seq != 0);
+ return false;
+ } else {
+ assert(seq != 0); // already handled in the first if-then clause
+ return IsVisibleFullCheck(seq);
+ }
+ }
+
+ inline SequenceNumber max_visible_seq() { return max_visible_seq_; }
+
+ // Refresh to a more recent visible seq
+ virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }
+
+ protected:
+  // The max visible seq; it is usually the snapshot, but it could be larger
+  // if the transaction has its own writes written to the DB.
+ SequenceNumber max_visible_seq_ = kMaxSequenceNumber;
+ // Any seq less than min_uncommitted_ is committed.
+ const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
new file mode 100644
index 000000000..383ffe3a4
--- /dev/null
+++ b/src/rocksdb/db/repair.cc
@@ -0,0 +1,691 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The Repairer makes a best-effort attempt to recover as much data as possible
+// after a disaster without compromising consistency. It does not guarantee
+// bringing the database to a time-consistent state.
+//
+// Repair process is broken into 4 phases:
+// (a) Find files
+// (b) Convert logs to tables
+// (c) Extract metadata
+// (d) Write Descriptor
+//
+// (a) Find files
+//
+// The repairer goes through all the files in the directory, and classifies them
+// based on their file name. Any file that cannot be identified by name will be
+// ignored.
+//
+// (b) Convert logs to tables
+//
+// Every log file that is active is replayed. All sections of the file where the
+// checksum does not match are skipped over. We intentionally give preference to
+// data consistency.
+//
+// (c) Extract metadata
+//
+// We scan every table to compute
+// (1) smallest/largest for the table
+// (2) largest sequence number in the table
+// (3) oldest blob file referred to by the table (if applicable)
+//
+// If we are unable to scan the file, then we ignore the table.
+//
+// (d) Write Descriptor
+//
+// We generate descriptor contents:
+// - log number is set to zero
+// - next-file-number is set to 1 + largest file number we found
+// - last-sequence-number is set to largest sequence# found across
+// all tables (see 2c)
+// - compaction pointers are cleared
+// - every table file is added at level 0
+//
+// Possible optimization 1:
+// (a) Compute total size and use to pick appropriate max-level M
+// (b) Sort tables by largest sequence# in the table
+// (c) For each table: if it overlaps earlier table, place in level-0,
+// else place in level-M.
+// (d) We can provide options for time consistent recovery and unsafe recovery
+// (ignore checksum failure when applicable)
+// Possible optimization 2:
+// Store per-table metadata (smallest, largest, largest-seq#, ...)
+// in the table's meta section to speed up ScanTable.
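+//
+// For illustration only, a caller would typically invoke the repair entry
+// points defined at the bottom of this file roughly as follows (the path is
+// hypothetical):
+//
+//   Options options;
+//   Status s = RepairDB("/path/to/db", options);
+//   if (s.ok()) {
+//     // Reopen the DB normally; some data may have been lost.
+//   }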
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class Repairer {
+ public:
+ Repairer(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& default_cf_opts,
+ const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
+ : dbname_(dbname),
+ env_(db_options.env),
+ env_options_(),
+ db_options_(SanitizeOptions(dbname_, db_options)),
+ immutable_db_options_(ImmutableDBOptions(db_options_)),
+ icmp_(default_cf_opts.comparator),
+ default_cf_opts_(
+ SanitizeOptions(immutable_db_options_, default_cf_opts)),
+ default_cf_iopts_(
+ ImmutableCFOptions(immutable_db_options_, default_cf_opts_)),
+ unknown_cf_opts_(
+ SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
+ create_unknown_cfs_(create_unknown_cfs),
+ raw_table_cache_(
+ // TableCache can be small since we expect each table to be opened
+ // once.
+ NewLRUCache(10, db_options_.table_cache_numshardbits)),
+ table_cache_(new TableCache(default_cf_iopts_, env_options_,
+ raw_table_cache_.get(),
+ /*block_cache_tracer=*/nullptr)),
+ wb_(db_options_.db_write_buffer_size),
+ wc_(db_options_.delayed_write_rate),
+ vset_(dbname_, &immutable_db_options_, env_options_,
+ raw_table_cache_.get(), &wb_, &wc_,
+ /*block_cache_tracer=*/nullptr),
+ next_file_number_(1),
+ db_lock_(nullptr) {
+ for (const auto& cfd : column_families) {
+ cf_name_to_opts_[cfd.name] = cfd.options;
+ }
+ }
+
+ const ColumnFamilyOptions* GetColumnFamilyOptions(
+ const std::string& cf_name) {
+ if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) {
+ if (create_unknown_cfs_) {
+ return &unknown_cf_opts_;
+ }
+ return nullptr;
+ }
+ return &cf_name_to_opts_[cf_name];
+ }
+
+ // Adds a column family to the VersionSet with cf_options_ and updates
+ // manifest.
+ Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
+ const auto* cf_opts = GetColumnFamilyOptions(cf_name);
+ if (cf_opts == nullptr) {
+ return Status::Corruption("Encountered unknown column family with name=" +
+ cf_name + ", id=" + ToString(cf_id));
+ }
+ Options opts(db_options_, *cf_opts);
+ MutableCFOptions mut_cf_opts(opts);
+
+ VersionEdit edit;
+ edit.SetComparatorName(opts.comparator->Name());
+ edit.SetLogNumber(0);
+ edit.SetColumnFamily(cf_id);
+ ColumnFamilyData* cfd;
+ cfd = nullptr;
+ edit.AddColumnFamily(cf_name);
+
+ mutex_.Lock();
+ Status status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_,
+ nullptr /* db_directory */,
+ false /* new_descriptor_log */, cf_opts);
+ mutex_.Unlock();
+ return status;
+ }
+
+ ~Repairer() {
+ if (db_lock_ != nullptr) {
+ env_->UnlockFile(db_lock_);
+ }
+ delete table_cache_;
+ }
+
+ Status Run() {
+ Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!status.ok()) {
+ return status;
+ }
+ status = FindFiles();
+ if (status.ok()) {
+ // Discard older manifests and start a fresh one
+ for (size_t i = 0; i < manifests_.size(); i++) {
+ ArchiveFile(dbname_ + "/" + manifests_[i]);
+ }
+ // Just create a DBImpl temporarily so we can reuse NewDB()
+ DBImpl* db_impl = new DBImpl(db_options_, dbname_);
+ status = db_impl->NewDB();
+ delete db_impl;
+ }
+
+ if (status.ok()) {
+ // Recover using the fresh manifest created by NewDB()
+ status =
+ vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false);
+ }
+ if (status.ok()) {
+ // Need to scan existing SST files first so the column families are
+ // created before we process WAL files
+ ExtractMetaData();
+
+ // ExtractMetaData() uses table_fds_ to know which SST files' metadata to
+ // extract -- we need to clear it here since metadata for existing SST
+ // files has been extracted already
+ table_fds_.clear();
+ ConvertLogFilesToTables();
+ ExtractMetaData();
+ status = AddTables();
+ }
+ if (status.ok()) {
+ uint64_t bytes = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ bytes += tables_[i].meta.fd.GetFileSize();
+ }
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "**** Repaired rocksdb %s; "
+ "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
+ " bytes. "
+ "Some data may have been lost. "
+ "****",
+ dbname_.c_str(), tables_.size(), bytes);
+ }
+ return status;
+ }
+
+ private:
+ struct TableInfo {
+ FileMetaData meta;
+ uint32_t column_family_id;
+ std::string column_family_name;
+ };
+
+ std::string const dbname_;
+ Env* const env_;
+ const EnvOptions env_options_;
+ const DBOptions db_options_;
+ const ImmutableDBOptions immutable_db_options_;
+ const InternalKeyComparator icmp_;
+ const ColumnFamilyOptions default_cf_opts_;
+ const ImmutableCFOptions default_cf_iopts_; // table_cache_ holds reference
+ const ColumnFamilyOptions unknown_cf_opts_;
+ const bool create_unknown_cfs_;
+ std::shared_ptr<Cache> raw_table_cache_;
+ TableCache* table_cache_;
+ WriteBufferManager wb_;
+ WriteController wc_;
+ VersionSet vset_;
+ std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_opts_;
+ InstrumentedMutex mutex_;
+
+ std::vector<std::string> manifests_;
+ std::vector<FileDescriptor> table_fds_;
+ std::vector<uint64_t> logs_;
+ std::vector<TableInfo> tables_;
+ uint64_t next_file_number_;
+ // Lock over the persistent DB state. Non-nullptr iff successfully
+ // acquired.
+ FileLock* db_lock_;
+
+ Status FindFiles() {
+ std::vector<std::string> filenames;
+ bool found_file = false;
+ std::vector<std::string> to_search_paths;
+
+ for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) {
+ to_search_paths.push_back(db_options_.db_paths[path_id].path);
+ }
+
+    // Also search wal_dir if the user configured a custom wal_dir.
+ bool same = false;
+ Status status = env_->AreFilesSame(db_options_.wal_dir, dbname_, &same);
+ if (status.IsNotSupported()) {
+ same = db_options_.wal_dir == dbname_;
+ status = Status::OK();
+ } else if (!status.ok()) {
+ return status;
+ }
+
+ if (!same) {
+ to_search_paths.push_back(db_options_.wal_dir);
+ }
+
+ for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) {
+ status = env_->GetChildren(to_search_paths[path_id], &filenames);
+ if (!status.ok()) {
+ return status;
+ }
+ if (!filenames.empty()) {
+ found_file = true;
+ }
+
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type)) {
+ if (type == kDescriptorFile) {
+ manifests_.push_back(filenames[i]);
+ } else {
+ if (number + 1 > next_file_number_) {
+ next_file_number_ = number + 1;
+ }
+ if (type == kLogFile) {
+ logs_.push_back(number);
+ } else if (type == kTableFile) {
+ table_fds_.emplace_back(number, static_cast<uint32_t>(path_id),
+ 0);
+ } else {
+ // Ignore other files
+ }
+ }
+ }
+ }
+ }
+ if (!found_file) {
+ return Status::Corruption(dbname_, "repair found no files");
+ }
+ return Status::OK();
+ }
+
+ void ConvertLogFilesToTables() {
+ for (size_t i = 0; i < logs_.size(); i++) {
+      // Use LogFileName(wal_dir, logs_[i]) here because the user might have
+      // set a custom wal_dir option.
+ std::string logname = LogFileName(db_options_.wal_dir, logs_[i]);
+ Status status = ConvertLogToTable(logs_[i]);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Log #%" PRIu64 ": ignoring conversion error: %s",
+ logs_[i], status.ToString().c_str());
+ }
+ ArchiveFile(logname);
+ }
+ }
+
+ Status ConvertLogToTable(uint64_t log) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ std::shared_ptr<Logger> info_log;
+ uint64_t lognum;
+ void Corruption(size_t bytes, const Status& s) override {
+ // We print error messages for corruption, but continue repairing.
+ ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s",
+ lognum, static_cast<int>(bytes), s.ToString().c_str());
+ }
+ };
+
+ // Open the log file
+ std::string logname = LogFileName(db_options_.wal_dir, log);
+ std::unique_ptr<SequentialFile> lfile;
+ Status status = env_->NewSequentialFile(
+ logname, &lfile, env_->OptimizeForLogRead(env_options_));
+ if (!status.ok()) {
+ return status;
+ }
+ std::unique_ptr<SequentialFileReader> lfile_reader(new SequentialFileReader(
+ NewLegacySequentialFileWrapper(lfile), logname));
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = db_options_.info_log;
+ reporter.lognum = log;
+ // We intentionally make log::Reader do checksumming so that
+ // corruptions cause entire commits to be skipped instead of
+ // propagating bad information (like overly large sequence
+ // numbers).
+ log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter,
+ true /*enable checksum*/, log);
+
+ // Initialize per-column family memtables
+ for (auto* cfd : *vset_.GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
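+    // ColumnFamilyMemTablesImpl lets WriteBatchInternal::InsertInto route each
+    // record of a batch to the memtable of the column family it targets.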
+ auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ int counter = 0;
+ while (reader.ReadRecord(&record, &scratch)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ status =
+ WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr);
+ if (status.ok()) {
+ counter += WriteBatchInternal::Count(&batch);
+ } else {
+ ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s",
+ log, status.ToString().c_str());
+ status = Status::OK(); // Keep going with rest of file
+ }
+ }
+
+ // Dump a table for each column family with entries in this log file.
+ for (auto* cfd : *vset_.GetColumnFamilySet()) {
+ // Do not record a version edit for this conversion to a Table
+ // since ExtractMetaData() will also generate edits.
+ MemTable* mem = cfd->mem();
+ if (mem->IsEmpty()) {
+ continue;
+ }
+
+ FileMetaData meta;
+ meta.fd = FileDescriptor(next_file_number_++, 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ int64_t _current_time = 0;
+ status = env_->GetCurrentTime(&_current_time); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
+
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter =
+ mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+
+ LegacyFileSystemWrapper fs(env_);
+ status = BuildTable(
+ dbname_, env_, &fs, *cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions(), env_options_, table_cache_,
+ iter.get(), std::move(range_del_iters), &meta,
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ cfd->GetID(), cfd->GetName(), {}, kMaxSequenceNumber,
+ snapshot_checker, kNoCompression, 0 /* sample_for_compression */,
+ CompressionOptions(), false, nullptr /* internal_stats */,
+ TableFileCreationReason::kRecovery, nullptr /* event_logger */,
+ 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */,
+ -1 /* level */, current_time, write_hint);
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
+ log, counter, meta.fd.GetNumber(),
+ status.ToString().c_str());
+ if (status.ok()) {
+ if (meta.fd.GetFileSize() > 0) {
+ table_fds_.push_back(meta.fd);
+ }
+ } else {
+ break;
+ }
+ }
+ delete cf_mems;
+ return status;
+ }
+
+ void ExtractMetaData() {
+ for (size_t i = 0; i < table_fds_.size(); i++) {
+ TableInfo t;
+ t.meta.fd = table_fds_[i];
+ Status status = ScanTable(&t);
+ if (!status.ok()) {
+ std::string fname = TableFileName(
+ db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
+ char file_num_buf[kFormatFileNumberBufSize];
+ FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
+ file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s",
+ file_num_buf, status.ToString().c_str());
+ ArchiveFile(fname);
+ } else {
+ tables_.push_back(t);
+ }
+ }
+ }
+
+ Status ScanTable(TableInfo* t) {
+ std::string fname = TableFileName(
+ db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId());
+ int counter = 0;
+ uint64_t file_size;
+ Status status = env_->GetFileSize(fname, &file_size);
+ t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
+ file_size);
+ std::shared_ptr<const TableProperties> props;
+ if (status.ok()) {
+ status = table_cache_->GetTableProperties(env_options_, icmp_, t->meta.fd,
+ &props);
+ }
+ if (status.ok()) {
+ t->column_family_id = static_cast<uint32_t>(props->column_family_id);
+ if (t->column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Table #%" PRIu64
+ ": column family unknown (probably due to legacy format); "
+ "adding to default column family id 0.",
+ t->meta.fd.GetNumber());
+ t->column_family_id = 0;
+ }
+
+ if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
+ nullptr) {
+ status =
+ AddColumnFamily(props->column_family_name, t->column_family_id);
+ }
+ t->meta.oldest_ancester_time = props->creation_time;
+ }
+ ColumnFamilyData* cfd = nullptr;
+ if (status.ok()) {
+ cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
+ if (cfd->GetName() != props->column_family_name) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Table #%" PRIu64
+ ": inconsistent column family name '%s'; expected '%s' for column "
+ "family id %" PRIu32 ".",
+ t->meta.fd.GetNumber(), props->column_family_name.c_str(),
+ cfd->GetName().c_str(), t->column_family_id);
+ status = Status::Corruption(dbname_, "inconsistent column family name");
+ }
+ }
+ if (status.ok()) {
+ ReadOptions ropts;
+ ropts.total_order_seek = true;
+ InternalIterator* iter = table_cache_->NewIterator(
+ ropts, env_options_, cfd->internal_comparator(), t->meta,
+ nullptr /* range_del_agg */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
+ /*level=*/-1, /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ ParsedInternalKey parsed;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ if (!ParseInternalKey(key, &parsed)) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Table #%" PRIu64 ": unparsable key %s",
+ t->meta.fd.GetNumber(), EscapeString(key).c_str());
+ continue;
+ }
+
+ counter++;
+
+ t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
+ parsed.type);
+ }
+ if (!iter->status().ok()) {
+ status = iter->status();
+ }
+ delete iter;
+
+ ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+ t->meta.fd.GetNumber(), counter,
+ status.ToString().c_str());
+ }
+ return status;
+ }
+
+ Status AddTables() {
+ std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
+ SequenceNumber max_sequence = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
+ if (max_sequence < tables_[i].meta.fd.largest_seqno) {
+ max_sequence = tables_[i].meta.fd.largest_seqno;
+ }
+ }
+ vset_.SetLastAllocatedSequence(max_sequence);
+ vset_.SetLastPublishedSequence(max_sequence);
+ vset_.SetLastSequence(max_sequence);
+
+ for (const auto& cf_id_and_tables : cf_id_to_tables) {
+ auto* cfd =
+ vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
+ VersionEdit edit;
+ edit.SetComparatorName(cfd->user_comparator()->Name());
+ edit.SetLogNumber(0);
+ edit.SetNextFile(next_file_number_);
+ edit.SetColumnFamily(cfd->GetID());
+
+ // TODO(opt): separate out into multiple levels
+ for (const auto* table : cf_id_and_tables.second) {
+ edit.AddFile(
+ 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
+ table->meta.fd.GetFileSize(), table->meta.smallest,
+ table->meta.largest, table->meta.fd.smallest_seqno,
+ table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
+ table->meta.oldest_blob_file_number,
+ table->meta.oldest_ancester_time, table->meta.file_creation_time,
+ table->meta.file_checksum, table->meta.file_checksum_func_name);
+ }
+ assert(next_file_number_ > 0);
+ vset_.MarkFileNumberUsed(next_file_number_ - 1);
+ mutex_.Lock();
+ Status status = vset_.LogAndApply(
+ cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
+ nullptr /* db_directory */, false /* new_descriptor_log */);
+ mutex_.Unlock();
+ if (!status.ok()) {
+ return status;
+ }
+ }
+ return Status::OK();
+ }
+
+ void ArchiveFile(const std::string& fname) {
+ // Move into another directory. E.g., for
+ // dir/foo
+ // rename to
+ // dir/lost/foo
+ const char* slash = strrchr(fname.c_str(), '/');
+ std::string new_dir;
+ if (slash != nullptr) {
+ new_dir.assign(fname.data(), slash - fname.data());
+ }
+ new_dir.append("/lost");
+ env_->CreateDir(new_dir); // Ignore error
+ std::string new_file = new_dir;
+ new_file.append("/");
+ new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+ Status s = env_->RenameFile(fname, new_file);
+ ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
+ s.ToString().c_str());
+ }
+};
+
+Status GetDefaultCFOptions(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ ColumnFamilyOptions* res) {
+ assert(res != nullptr);
+ auto iter = std::find_if(column_families.begin(), column_families.end(),
+ [](const ColumnFamilyDescriptor& cfd) {
+ return cfd.name == kDefaultColumnFamilyName;
+ });
+ if (iter == column_families.end()) {
+ return Status::InvalidArgument(
+ "column_families", "Must contain entry for default column family");
+ }
+ *res = iter->options;
+ return Status::OK();
+}
+} // anonymous namespace
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families
+ ) {
+ ColumnFamilyOptions default_cf_opts;
+ Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+ if (status.ok()) {
+ Repairer repairer(dbname, db_options, column_families,
+ default_cf_opts,
+ ColumnFamilyOptions() /* unknown_cf_opts */,
+ false /* create_unknown_cfs */);
+ status = repairer.Run();
+ }
+ return status;
+}
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& unknown_cf_opts) {
+ ColumnFamilyOptions default_cf_opts;
+ Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+ if (status.ok()) {
+ Repairer repairer(dbname, db_options,
+ column_families, default_cf_opts,
+ unknown_cf_opts, true /* create_unknown_cfs */);
+ status = repairer.Run();
+ }
+ return status;
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+ Options opts(options);
+ if (opts.file_system == nullptr) {
+ opts.file_system.reset(new LegacyFileSystemWrapper(opts.env));
+ }
+
+ DBOptions db_options(opts);
+ ColumnFamilyOptions cf_options(opts);
+ Repairer repairer(dbname, db_options,
+ {}, cf_options /* default_cf_opts */,
+ cf_options /* unknown_cf_opts */,
+ true /* create_unknown_cfs */);
+ return repairer.Run();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/repair_test.cc b/src/rocksdb/db/repair_test.cc
new file mode 100644
index 000000000..ba2bae3d0
--- /dev/null
+++ b/src/rocksdb/db/repair_test.cc
@@ -0,0 +1,369 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/transaction_log.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class RepairTest : public DBTestBase {
+ public:
+ RepairTest() : DBTestBase("/repair_test") {}
+
+ std::string GetFirstSstPath() {
+ uint64_t manifest_size;
+ std::vector<std::string> files;
+ db_->GetLiveFiles(files, &manifest_size);
+ auto sst_iter =
+ std::find_if(files.begin(), files.end(), [](const std::string& file) {
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(file, &number, &type);
+ return ok && type == kTableFile;
+ });
+ return sst_iter == files.end() ? "" : dbname_ + *sst_iter;
+ }
+};
+
+TEST_F(RepairTest, LostManifest) {
+ // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+ // the day.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, CorruptManifest) {
+ // Manifest is in an invalid format. Expect a full recovery.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ // Need to get path before Close() deletes db_, but overwrite it after Close()
+ // to ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+
+ LegacyFileSystemWrapper fs(env_);
+ CreateFile(&fs, manifest_path, "blah", false /* use_fsync */);
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, IncompleteManifest) {
+ // In this case, the manifest is valid but does not reference all of the SST
+ // files. Expect a full recovery.
+ Put("key", "val");
+ Flush();
+ std::string orig_manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+ CopyFile(orig_manifest_path, orig_manifest_path + ".tmp");
+ Put("key2", "val2");
+ Flush();
+ // Need to get path before Close() deletes db_, but overwrite it after Close()
+ // to ensure Close() didn't change the manifest.
+ std::string new_manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(new_manifest_path));
+ // Replace the manifest with one that is only aware of the first SST file.
+ CopyFile(orig_manifest_path + ".tmp", new_manifest_path);
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, PostRepairSstFileNumbering) {
+ // Verify after a DB is repaired, new files will be assigned higher numbers
+ // than old files.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+ Close();
+
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ Reopen(CurrentOptions());
+ uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+ ASSERT_GE(post_repair_file_num, pre_repair_file_num);
+}
+
+TEST_F(RepairTest, LostSst) {
+ // Delete one of the SST files but preserve the manifest that refers to it,
+ // then verify the DB is still usable for the intact SST.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ auto sst_path = GetFirstSstPath();
+ ASSERT_FALSE(sst_path.empty());
+ ASSERT_OK(env_->DeleteFile(sst_path));
+
+ Close();
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ // Exactly one of the key-value pairs should be in the DB now.
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, CorruptSst) {
+ // Corrupt one of the SST files but preserve the manifest that refers to it,
+ // then verify the DB is still usable for the intact SST.
+ Put("key", "val");
+ Flush();
+ Put("key2", "val2");
+ Flush();
+ auto sst_path = GetFirstSstPath();
+ ASSERT_FALSE(sst_path.empty());
+
+ LegacyFileSystemWrapper fs(env_);
+ CreateFile(&fs, sst_path, "blah", false /* use_fsync */);
+
+ Close();
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ // Exactly one of the key-value pairs should be in the DB now.
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, UnflushedSst) {
+ // This test case invokes repair while some data is unflushed, then verifies
+ // that data is in the db.
+ Put("key", "val");
+ VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 1);
+ uint64_t total_ssts_size;
+ GetAllSSTFiles(&total_ssts_size);
+ ASSERT_EQ(total_ssts_size, 0);
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ Reopen(CurrentOptions());
+
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 0);
+ GetAllSSTFiles(&total_ssts_size);
+ ASSERT_GT(total_ssts_size, 0);
+ ASSERT_EQ(Get("key"), "val");
+}
+
+TEST_F(RepairTest, SeparateWalDir) {
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ Put("key", "val");
+ Put("foo", "bar");
+ VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 1);
+ uint64_t total_ssts_size;
+ GetAllSSTFiles(&total_ssts_size);
+ ASSERT_EQ(total_ssts_size, 0);
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, options));
+
+ // make sure that all WALs are converted to SSTables.
+ options.wal_dir = "";
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 0);
+ GetAllSSTFiles(&total_ssts_size);
+ ASSERT_GT(total_ssts_size, 0);
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("foo"), "bar");
+
+  } while (ChangeWalOptions());
+}
+
+TEST_F(RepairTest, RepairMultipleColumnFamilies) {
+ // Verify repair logic associates SST files with their original column
+ // families.
+ const int kNumCfs = 3;
+ const int kEntriesPerCf = 2;
+ DestroyAndReopen(CurrentOptions());
+ CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions());
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ Put(i, "key" + ToString(j), "val" + ToString(j));
+ if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) {
+ // Leave one unflushed so we can verify WAL entries are properly
+ // associated with column families.
+ continue;
+ }
+ Flush(i);
+ }
+ }
+
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() doesn't re-create the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"},
+ CurrentOptions());
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
+ }
+ }
+}
+
+TEST_F(RepairTest, RepairColumnFamilyOptions) {
+ // Verify repair logic uses correct ColumnFamilyOptions when repairing a
+ // database with different options for column families.
+ const int kNumCfs = 2;
+ const int kEntriesPerCf = 2;
+
+ Options opts(CurrentOptions()), rev_opts(CurrentOptions());
+ opts.comparator = BytewiseComparator();
+ rev_opts.comparator = ReverseBytewiseComparator();
+
+ DestroyAndReopen(opts);
+ CreateColumnFamilies({"reverse"}, rev_opts);
+ ReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts});
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ Put(i, "key" + ToString(j), "val" + ToString(j));
+ if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) {
+ // Leave one unflushed so we can verify RepairDB's flush logic
+ continue;
+ }
+ Flush(i);
+ }
+ }
+ Close();
+
+ // RepairDB() records the comparator in the manifest, and DB::Open would fail
+ // if a different comparator were used.
+ ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}, {"reverse", rev_opts}},
+ opts /* unknown_cf_opts */));
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts}));
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
+ }
+ }
+
+ // Examine table properties to verify RepairDB() used the right options when
+ // converting WAL->SST
+ TablePropertiesCollection fname_to_props;
+ db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props);
+ ASSERT_EQ(fname_to_props.size(), 2U);
+ for (const auto& fname_and_props : fname_to_props) {
+    std::string comparator_name(
+        InternalKeyComparator(rev_opts.comparator).Name());
+ comparator_name = comparator_name.substr(comparator_name.find(':') + 1);
+ ASSERT_EQ(comparator_name,
+ fname_and_props.second->comparator_name);
+ }
+ Close();
+
+ // Also check comparator when it's provided via "unknown" CF options
+ ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}},
+ rev_opts /* unknown_cf_opts */));
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts}));
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
+ }
+ }
+}
+
+TEST_F(RepairTest, DbNameContainsTrailingSlash) {
+ {
+ bool tmp;
+ if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+ fprintf(stderr,
+ "skipping RepairTest.DbNameContainsTrailingSlash due to "
+ "unsupported Env::AreFilesSame\n");
+ return;
+ }
+ }
+
+ Put("key", "val");
+ Flush();
+ Close();
+
+ ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions()));
+ Reopen(CurrentOptions());
+ ASSERT_EQ(Get("key"), "val");
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/snapshot_checker.h b/src/rocksdb/db/snapshot_checker.h
new file mode 100644
index 000000000..1d2c2c316
--- /dev/null
+++ b/src/rocksdb/db/snapshot_checker.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class SnapshotCheckerResult : int {
+ kInSnapshot = 0,
+ kNotInSnapshot = 1,
+  // Returned when the snapshot has been released and the checker cannot tell
+  // whether the given sequence is visible to the snapshot.
+ kSnapshotReleased = 2,
+};
+
+// Callback class that controls GC of duplicate keys in flush/compaction.
+class SnapshotChecker {
+ public:
+ virtual ~SnapshotChecker() {}
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0;
+};
+
+class DisableGCSnapshotChecker : public SnapshotChecker {
+ public:
+ virtual ~DisableGCSnapshotChecker() {}
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber /*sequence*/,
+ SequenceNumber /*snapshot_sequence*/) const override {
+ // By returning kNotInSnapshot, we prevent all the values from being GCed
+ return SnapshotCheckerResult::kNotInSnapshot;
+ }
+ static DisableGCSnapshotChecker* Instance() { return &instance_; }
+
+ protected:
+ static DisableGCSnapshotChecker instance_;
+ explicit DisableGCSnapshotChecker() {}
+};
+
+class WritePreparedTxnDB;
+
+// Callback class created by WritePreparedTxnDB to check whether a key
+// is visible to a snapshot.
+class WritePreparedSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db);
+ virtual ~WritePreparedSnapshotChecker() {}
+
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const override;
+
+ private:
+#ifndef ROCKSDB_LITE
+ const WritePreparedTxnDB* const txn_db_;
+#endif // !ROCKSDB_LITE
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.cc b/src/rocksdb/db/snapshot_impl.cc
new file mode 100644
index 000000000..b9228c797
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/snapshot.h"
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ManagedSnapshot::ManagedSnapshot(DB* db) : db_(db),
+ snapshot_(db->GetSnapshot()) {}
+
+ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot)
+ : db_(db), snapshot_(_snapshot) {}
+
+ManagedSnapshot::~ManagedSnapshot() {
+ if (snapshot_) {
+ db_->ReleaseSnapshot(snapshot_);
+ }
+}
+
+const Snapshot* ManagedSnapshot::snapshot() { return snapshot_;}
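+
+// For illustration only (hypothetical caller code): ManagedSnapshot is an RAII
+// wrapper, so the snapshot acquired in the constructor is released when the
+// wrapper goes out of scope.
+//
+//   {
+//     ManagedSnapshot snap(db);
+//     ReadOptions read_options;
+//     read_options.snapshot = snap.snapshot();
+//     // ... reads see a consistent view ...
+//   }  // ReleaseSnapshot() is called here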
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.h b/src/rocksdb/db/snapshot_impl.h
new file mode 100644
index 000000000..785f814f8
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.h
@@ -0,0 +1,167 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+ SequenceNumber number_; // const after creation
+  // Indicates the smallest uncommitted sequence number at the time the
+  // snapshot was taken. This is currently used by WritePrepared transactions
+  // to limit the scope of queries to IsInSnapshot.
+ SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+
+ virtual SequenceNumber GetSequenceNumber() const override { return number_; }
+
+ private:
+ friend class SnapshotList;
+
+ // SnapshotImpl is kept in a doubly-linked circular list
+ SnapshotImpl* prev_;
+ SnapshotImpl* next_;
+
+ SnapshotList* list_; // just for sanity checks
+
+ int64_t unix_time_;
+
+ // Will this snapshot be used by a Transaction to do write-conflict checking?
+ bool is_write_conflict_boundary_;
+};
+
+class SnapshotList {
+ public:
+ SnapshotList() {
+ list_.prev_ = &list_;
+ list_.next_ = &list_;
+ list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
+ // Set all the variables to make UBSAN happy.
+ list_.list_ = nullptr;
+ list_.unix_time_ = 0;
+ list_.is_write_conflict_boundary_ = false;
+ count_ = 0;
+ }
+
+ // No copy-construct.
+ SnapshotList(const SnapshotList&) = delete;
+
+ bool empty() const { return list_.next_ == &list_; }
+ SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+ SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+ SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time,
+ bool is_write_conflict_boundary) {
+ s->number_ = seq;
+ s->unix_time_ = unix_time;
+ s->is_write_conflict_boundary_ = is_write_conflict_boundary;
+ s->list_ = this;
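+    // Link s in just before the dummy head, i.e. at the tail of the circular
+    // list, so it becomes the newest snapshot.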
+ s->next_ = &list_;
+ s->prev_ = list_.prev_;
+ s->prev_->next_ = s;
+ s->next_->prev_ = s;
+ count_++;
+ return s;
+ }
+
+  // Not responsible for freeing the object.
+ void Delete(const SnapshotImpl* s) {
+ assert(s->list_ == this);
+ s->prev_->next_ = s->next_;
+ s->next_->prev_ = s->prev_;
+ count_--;
+ }
+
+ // retrieve all snapshot numbers up until max_seq. They are sorted in
+ // ascending order (with no duplicates).
+ std::vector<SequenceNumber> GetAll(
+ SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+ const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+ std::vector<SequenceNumber> ret;
+ GetAll(&ret, oldest_write_conflict_snapshot, max_seq);
+ return ret;
+ }
+
+ void GetAll(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+ const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+ std::vector<SequenceNumber>& ret = *snap_vector;
+ // So far we have no use case that would pass a non-empty vector
+ assert(ret.size() == 0);
+
+ if (oldest_write_conflict_snapshot != nullptr) {
+ *oldest_write_conflict_snapshot = kMaxSequenceNumber;
+ }
+
+ if (empty()) {
+ return;
+ }
+ const SnapshotImpl* s = &list_;
+ while (s->next_ != &list_) {
+ if (s->next_->number_ > max_seq) {
+ break;
+ }
+ // Avoid duplicates
+ if (ret.empty() || ret.back() != s->next_->number_) {
+ ret.push_back(s->next_->number_);
+ }
+
+ if (oldest_write_conflict_snapshot != nullptr &&
+ *oldest_write_conflict_snapshot == kMaxSequenceNumber &&
+ s->next_->is_write_conflict_boundary_) {
+ // If this is the first write-conflict boundary snapshot in the list,
+ // it is the oldest
+ *oldest_write_conflict_snapshot = s->next_->number_;
+ }
+
+ s = s->next_;
+ }
+ return;
+ }
+
+ // get the sequence number of the most recent snapshot
+ SequenceNumber GetNewest() {
+ if (empty()) {
+ return 0;
+ }
+ return newest()->number_;
+ }
+
+ int64_t GetOldestSnapshotTime() const {
+ if (empty()) {
+ return 0;
+ } else {
+ return oldest()->unix_time_;
+ }
+ }
+
+ int64_t GetOldestSnapshotSequence() const {
+ if (empty()) {
+ return 0;
+ } else {
+ return oldest()->GetSequenceNumber();
+ }
+ }
+
+ uint64_t count() const { return count_; }
+
+ private:
+ // Dummy head of doubly-linked list of snapshots
+ SnapshotImpl list_;
+ uint64_t count_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
new file mode 100644
index 000000000..411959a33
--- /dev/null
+++ b/src/rocksdb/db/table_cache.cc
@@ -0,0 +1,668 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+template <class T>
+static void DeleteEntry(const Slice& /*key*/, void* value) {
+ T* typed_value = reinterpret_cast<T*>(value);
+ delete typed_value;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+ Cache* cache = reinterpret_cast<Cache*>(arg1);
+ Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+ cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(const uint64_t* file_number) {
+ return Slice(reinterpret_cast<const char*>(file_number),
+ sizeof(*file_number));
+}
+
+#ifndef ROCKSDB_LITE
+
+void AppendVarint64(IterKey* key, uint64_t v) {
+ char buf[10];
+ auto ptr = EncodeVarint64(buf, v);
+ key->TrimAppend(key->Size(), buf, ptr - buf);
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace
+
+TableCache::TableCache(const ImmutableCFOptions& ioptions,
+ const FileOptions& file_options, Cache* const cache,
+ BlockCacheTracer* const block_cache_tracer)
+ : ioptions_(ioptions),
+ file_options_(file_options),
+ cache_(cache),
+ immortal_tables_(false),
+ block_cache_tracer_(block_cache_tracer) {
+ if (ioptions_.row_cache) {
+ // If the same cache is shared by multiple instances, we need to
+ // disambiguate its entries.
+ PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
+ }
+}
+
+TableCache::~TableCache() {
+}
+
+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+ return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+ cache_->Release(handle);
+}
+
+Status TableCache::GetTableReader(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+ bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist,
+ std::unique_ptr<TableReader>* table_reader,
+ const SliceTransform* prefix_extractor, bool skip_filters, int level,
+ bool prefetch_index_and_filter_in_cache) {
+ std::string fname =
+ TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId());
+ std::unique_ptr<FSRandomAccessFile> file;
+ Status s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
+ nullptr);
+ RecordTick(ioptions_.statistics, NO_FILE_OPENS);
+ if (s.IsPathNotFound()) {
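+    // Fall back to the LevelDB-style ".ldb" file name used by older releases
+    // before reporting the open failure.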
+ fname = Rocks2LevelTableFileName(fname);
+ s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, nullptr);
+ RecordTick(ioptions_.statistics, NO_FILE_OPENS);
+ }
+
+ if (s.ok()) {
+ if (!sequential_mode && ioptions_.advise_random_on_open) {
+ file->Hint(FSRandomAccessFile::kRandom);
+ }
+ StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), fname, ioptions_.env,
+ record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS,
+ file_read_hist, ioptions_.rate_limiter, ioptions_.listeners));
+ s = ioptions_.table_factory->NewTableReader(
+ TableReaderOptions(ioptions_, prefix_extractor, file_options,
+ internal_comparator, skip_filters, immortal_tables_,
+ level, fd.largest_seqno, block_cache_tracer_),
+ std::move(file_reader), fd.GetFileSize(), table_reader,
+ prefetch_index_and_filter_in_cache);
+ TEST_SYNC_POINT("TableCache::GetTableReader:0");
+ }
+ return s;
+}
+
+void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) {
+ ReleaseHandle(handle);
+ uint64_t number = fd.GetNumber();
+ Slice key = GetSliceForFileNumber(&number);
+ cache_->Erase(key);
+}
+
+Status TableCache::FindTable(const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& fd, Cache::Handle** handle,
+ const SliceTransform* prefix_extractor,
+ const bool no_io, bool record_read_stats,
+ HistogramImpl* file_read_hist, bool skip_filters,
+ int level,
+ bool prefetch_index_and_filter_in_cache) {
+ PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env);
+ Status s;
+ uint64_t number = fd.GetNumber();
+ Slice key = GetSliceForFileNumber(&number);
+ *handle = cache_->Lookup(key);
+ TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
+ const_cast<bool*>(&no_io));
+
+ if (*handle == nullptr) {
+ if (no_io) { // Don't do IO and return a not-found status
+ return Status::Incomplete("Table not found in table_cache, no_io is set");
+ }
+ std::unique_ptr<TableReader> table_reader;
+ s = GetTableReader(file_options, internal_comparator, fd,
+ false /* sequential mode */, record_read_stats,
+ file_read_hist, &table_reader, prefix_extractor,
+ skip_filters, level, prefetch_index_and_filter_in_cache);
+ if (!s.ok()) {
+ assert(table_reader == nullptr);
+ RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
+ // We do not cache error results so that if the error is transient,
+ // or somebody repairs the file, we recover automatically.
+ } else {
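+      // Each table reader is inserted with a charge of 1, so the table cache
+      // capacity effectively bounds the number of concurrently open tables.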
+ s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry<TableReader>,
+ handle);
+ if (s.ok()) {
+ // Release ownership of table reader.
+ table_reader.release();
+ }
+ }
+ }
+ return s;
+}
+
+InternalIterator* TableCache::NewIterator(
+ const ReadOptions& options, const FileOptions& file_options,
+ const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
+ RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor,
+ TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+ const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key) {
+ PERF_TIMER_GUARD(new_table_iterator_nanos);
+
+ Status s;
+ TableReader* table_reader = nullptr;
+ Cache::Handle* handle = nullptr;
+ if (table_reader_ptr != nullptr) {
+ *table_reader_ptr = nullptr;
+ }
+ bool for_compaction = caller == TableReaderCaller::kCompaction;
+ auto& fd = file_meta.fd;
+ table_reader = fd.table_reader;
+ if (table_reader == nullptr) {
+ s = FindTable(file_options, icomparator, fd, &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ !for_compaction /* record_read_stats */, file_read_hist,
+ skip_filters, level);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(handle);
+ }
+ }
+ InternalIterator* result = nullptr;
+ if (s.ok()) {
+ if (options.table_filter &&
+ !options.table_filter(*table_reader->GetTableProperties())) {
+ result = NewEmptyInternalIterator<Slice>(arena);
+ } else {
+ result = table_reader->NewIterator(options, prefix_extractor, arena,
+ skip_filters, caller,
+ file_options.compaction_readahead_size);
+ }
+ if (handle != nullptr) {
+ result->RegisterCleanup(&UnrefEntry, cache_, handle);
+      handle = nullptr;  // prevent it from being released below
+ }
+
+ if (for_compaction) {
+ table_reader->SetupForCompaction();
+ }
+ if (table_reader_ptr != nullptr) {
+ *table_reader_ptr = table_reader;
+ }
+ }
+ if (s.ok() && range_del_agg != nullptr && !options.ignore_range_deletions) {
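+    // AddFile() returns true only the first time this file number is seen, so
+    // each SST's range tombstones are added to the aggregator at most once.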
+ if (range_del_agg->AddFile(fd.GetNumber())) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ static_cast<FragmentedRangeTombstoneIterator*>(
+ table_reader->NewRangeTombstoneIterator(options)));
+ if (range_del_iter != nullptr) {
+ s = range_del_iter->status();
+ }
+ if (s.ok()) {
+ const InternalKey* smallest = &file_meta.smallest;
+ const InternalKey* largest = &file_meta.largest;
+ if (smallest_compaction_key != nullptr) {
+ smallest = smallest_compaction_key;
+ }
+ if (largest_compaction_key != nullptr) {
+ largest = largest_compaction_key;
+ }
+ range_del_agg->AddTombstones(std::move(range_del_iter), smallest,
+ largest);
+ }
+ }
+ }
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ if (!s.ok()) {
+ assert(result == nullptr);
+ result = NewErrorInternalIterator<Slice>(s, arena);
+ }
+ return result;
+}
+
+Status TableCache::GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
+ const FileDescriptor& fd = file_meta.fd;
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (t == nullptr) {
+ s = FindTable(file_options_, internal_comparator, fd, &handle);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ if (s.ok()) {
+ out_iter->reset(t->NewRangeTombstoneIterator(options));
+ assert(out_iter);
+ }
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context,
+ IterKey& row_cache_key) {
+ uint64_t fd_number = fd.GetNumber();
+  // We use the user key as the cache key instead of the internal key,
+  // otherwise the whole cache would be invalidated every time the
+  // sequence number increases. However, to support caching snapshot
+  // reads, we append the sequence number (incremented by 1 to
+  // distinguish it from 0), but only in that case.
+ // If the snapshot is larger than the largest seqno in the file,
+ // all data should be exposed to the snapshot, so we treat it
+ // the same as there is no snapshot. The exception is that if
+ // a seq-checking callback is registered, some internal keys
+ // may still be filtered out.
+ uint64_t seq_no = 0;
+  // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+ if (options.snapshot != nullptr &&
+ (get_context->has_callback() ||
+ static_cast_with_check<const SnapshotImpl, const Snapshot>(
+ options.snapshot)
+ ->GetSequenceNumber() <= fd.largest_seqno)) {
+      // We should consider using options.snapshot->GetSequenceNumber()
+      // instead of GetInternalKeySeqno(k), which would make the code
+      // easier to understand.
+ seq_no = 1 + GetInternalKeySeqno(internal_key);
+ }
+
+ // Compute row cache key.
+ row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+ row_cache_id_.size());
+ AppendVarint64(&row_cache_key, fd_number);
+ AppendVarint64(&row_cache_key, seq_no);
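+  // GetFromRowCache() later appends the user key, so the complete lookup key
+  // is [row_cache_id_][file_number][seq_no][user_key].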
+}
+
+bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context) {
+ bool found = false;
+
+ row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
+ if (auto row_handle =
+ ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) {
+ // Cleanable routine to release the cache entry
+ Cleanable value_pinner;
+ auto release_cache_entry_func = [](void* cache_to_clean,
+ void* cache_handle) {
+ ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle);
+ };
+ auto found_row_cache_entry =
+ static_cast<const std::string*>(ioptions_.row_cache->Value(row_handle));
+    // If we get here, the value is located in the cache.
+ // found_row_cache_entry points to the value on cache,
+ // and value_pinner has cleanup procedure for the cached entry.
+ // After replayGetContextLog() returns, get_context.pinnable_slice_
+ // will point to cache entry buffer (or a copy based on that) and
+ // cleanup routine under value_pinner will be delegated to
+ // get_context.pinnable_slice_. Cache entry is released when
+ // get_context.pinnable_slice_ is reset.
+ value_pinner.RegisterCleanup(release_cache_entry_func,
+ ioptions_.row_cache.get(), row_handle);
+ replayGetContextLog(*found_row_cache_entry, user_key, get_context,
+ &value_pinner);
+ RecordTick(ioptions_.statistics, ROW_CACHE_HIT);
+ found = true;
+ } else {
+ RecordTick(ioptions_.statistics, ROW_CACHE_MISS);
+ }
+ return found;
+}
+#endif // ROCKSDB_LITE
+
+Status TableCache::Get(const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k,
+ GetContext* get_context,
+ const SliceTransform* prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters,
+ int level) {
+ auto& fd = file_meta.fd;
+ std::string* row_cache_entry = nullptr;
+ bool done = false;
+#ifndef ROCKSDB_LITE
+ IterKey row_cache_key;
+ std::string row_cache_entry_buffer;
+
+ // Check row cache if enabled. Since row cache does not currently store
+ // sequence numbers, we cannot use it if we need to fetch the sequence.
+ if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+ auto user_key = ExtractUserKey(k);
+ CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
+ done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
+ get_context);
+ if (!done) {
+ row_cache_entry = &row_cache_entry_buffer;
+ }
+ }
+#endif // ROCKSDB_LITE
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (!done && s.ok()) {
+ if (t == nullptr) {
+ s = FindTable(
+ file_options_, internal_comparator, fd, &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, skip_filters, level);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ SequenceNumber* max_covering_tombstone_seq =
+ get_context->max_covering_tombstone_seq();
+ if (s.ok() && max_covering_tombstone_seq != nullptr &&
+ !options.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ t->NewRangeTombstoneIterator(options));
+ if (range_del_iter != nullptr) {
+ *max_covering_tombstone_seq = std::max(
+ *max_covering_tombstone_seq,
+ range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k)));
+ }
+ }
+ if (s.ok()) {
+ get_context->SetReplayLog(row_cache_entry); // nullptr if no cache.
+ s = t->Get(options, k, get_context, prefix_extractor, skip_filters);
+ get_context->SetReplayLog(nullptr);
+ } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+ // Couldn't find Table in cache but treat as kFound if no_io set
+ get_context->MarkKeyMayExist();
+ s = Status::OK();
+ done = true;
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ // Put the replay log in row cache only if something was found.
+ if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
+ size_t charge =
+ row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string);
+ void* row_ptr = new std::string(std::move(*row_cache_entry));
+ ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+ &DeleteEntry<std::string>);
+ }
+#endif // ROCKSDB_LITE
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ return s;
+}
+
+// Batched version of TableCache::MultiGet.
+Status TableCache::MultiGet(const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const MultiGetContext::Range* mget_range,
+ const SliceTransform* prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters,
+ int level) {
+ auto& fd = file_meta.fd;
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ MultiGetRange table_range(*mget_range, mget_range->begin(),
+ mget_range->end());
+#ifndef ROCKSDB_LITE
+ autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
+ IterKey row_cache_key;
+ size_t row_cache_key_prefix_size = 0;
+ KeyContext& first_key = *table_range.begin();
+ bool lookup_row_cache =
+ ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
+
+ // Check row cache if enabled. Since row cache does not currently store
+ // sequence numbers, we cannot use it if we need to fetch the sequence.
+ if (lookup_row_cache) {
+ GetContext* first_context = first_key.get_context;
+ CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
+ row_cache_key);
+ row_cache_key_prefix_size = row_cache_key.Size();
+
+ for (auto miter = table_range.begin(); miter != table_range.end();
+ ++miter) {
+      const Slice& user_key = miter->ukey;
+ GetContext* get_context = miter->get_context;
+
+ if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
+ get_context)) {
+ table_range.SkipKey(miter);
+ } else {
+ row_cache_entries.emplace_back();
+ get_context->SetReplayLog(&(row_cache_entries.back()));
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+  // Check that table_range is not empty. It's possible that all keys were
+  // found in the row cache, and thus the range may now be empty.
+ if (s.ok() && !table_range.empty()) {
+ if (t == nullptr) {
+ s = FindTable(
+ file_options_, internal_comparator, fd, &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, skip_filters, level);
+ TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ assert(t);
+ }
+ }
+ if (s.ok() && !options.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ t->NewRangeTombstoneIterator(options));
+ if (range_del_iter != nullptr) {
+ for (auto iter = table_range.begin(); iter != table_range.end();
+ ++iter) {
+ SequenceNumber* max_covering_tombstone_seq =
+ iter->get_context->max_covering_tombstone_seq();
+ *max_covering_tombstone_seq =
+ std::max(*max_covering_tombstone_seq,
+ range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey));
+ }
+ }
+ }
+ if (s.ok()) {
+ t->MultiGet(options, &table_range, prefix_extractor, skip_filters);
+ } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+ for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+ Status* status = iter->s;
+ if (status->IsIncomplete()) {
+ // Couldn't find Table in cache but treat as kFound if no_io set
+ iter->get_context->MarkKeyMayExist();
+ s = Status::OK();
+ }
+ }
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ if (lookup_row_cache) {
+ size_t row_idx = 0;
+
+ for (auto miter = table_range.begin(); miter != table_range.end();
+ ++miter) {
+ std::string& row_cache_entry = row_cache_entries[row_idx++];
+      const Slice& user_key = miter->ukey;
+ GetContext* get_context = miter->get_context;
+
+ get_context->SetReplayLog(nullptr);
+ // Compute row cache key.
+ row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
+ user_key.size());
+ // Put the replay log in row cache only if something was found.
+ if (s.ok() && !row_cache_entry.empty()) {
+ size_t charge =
+ row_cache_key.Size() + row_cache_entry.size() + sizeof(std::string);
+ void* row_ptr = new std::string(std::move(row_cache_entry));
+ ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+ &DeleteEntry<std::string>);
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ return s;
+}
+
+Status TableCache::GetTableProperties(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+ std::shared_ptr<const TableProperties>* properties,
+ const SliceTransform* prefix_extractor, bool no_io) {
+ Status s;
+ auto table_reader = fd.table_reader;
+  // Has the table reader already been pre-loaded?
+ if (table_reader) {
+ *properties = table_reader->GetTableProperties();
+
+ return s;
+ }
+
+ Cache::Handle* table_handle = nullptr;
+ s = FindTable(file_options, internal_comparator, fd, &table_handle,
+ prefix_extractor, no_io);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(table_handle);
+ auto table = GetTableReaderFromHandle(table_handle);
+ *properties = table->GetTableProperties();
+ ReleaseHandle(table_handle);
+ return s;
+}
+
+size_t TableCache::GetMemoryUsageByTableReader(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+ const SliceTransform* prefix_extractor) {
+ Status s;
+ auto table_reader = fd.table_reader;
+  // Has the table reader already been pre-loaded?
+ if (table_reader) {
+ return table_reader->ApproximateMemoryUsage();
+ }
+
+ Cache::Handle* table_handle = nullptr;
+ s = FindTable(file_options, internal_comparator, fd, &table_handle,
+ prefix_extractor, true);
+ if (!s.ok()) {
+ return 0;
+ }
+ assert(table_handle);
+ auto table = GetTableReaderFromHandle(table_handle);
+ auto ret = table->ApproximateMemoryUsage();
+ ReleaseHandle(table_handle);
+ return ret;
+}
+
+void TableCache::Evict(Cache* cache, uint64_t file_number) {
+ cache->Erase(GetSliceForFileNumber(&file_number));
+}
+
+uint64_t TableCache::ApproximateOffsetOf(
+ const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const SliceTransform* prefix_extractor) {
+ uint64_t result = 0;
+ TableReader* table_reader = fd.table_reader;
+ Cache::Handle* table_handle = nullptr;
+ if (table_reader == nullptr) {
+ const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+ Status s = FindTable(file_options_, internal_comparator, fd, &table_handle,
+ prefix_extractor, false /* no_io */,
+ !for_compaction /* record_read_stats */);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(table_handle);
+ }
+ }
+
+ if (table_reader != nullptr) {
+ result = table_reader->ApproximateOffsetOf(key, caller);
+ }
+ if (table_handle != nullptr) {
+ ReleaseHandle(table_handle);
+ }
+
+ return result;
+}
+
+uint64_t TableCache::ApproximateSize(
+ const Slice& start, const Slice& end, const FileDescriptor& fd,
+ TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
+ const SliceTransform* prefix_extractor) {
+ uint64_t result = 0;
+ TableReader* table_reader = fd.table_reader;
+ Cache::Handle* table_handle = nullptr;
+ if (table_reader == nullptr) {
+ const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+ Status s = FindTable(file_options_, internal_comparator, fd, &table_handle,
+ prefix_extractor, false /* no_io */,
+ !for_compaction /* record_read_stats */);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(table_handle);
+ }
+ }
+
+ if (table_reader != nullptr) {
+ result = table_reader->ApproximateSize(start, end, caller);
+ }
+ if (table_handle != nullptr) {
+ ReleaseHandle(table_handle);
+ }
+
+ return result;
+}
+} // namespace ROCKSDB_NAMESPACE
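The row cache branches in Get() and MultiGet() above are only taken when the DB is configured with a row cache. A usage sketch against the public API, with an illustrative path and sizes, showing how the ROW_CACHE_HIT / ROW_CACHE_MISS tickers recorded above can be observed:

#include <cassert>
#include <cstdint>
#include <string>
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.row_cache = rocksdb::NewLRUCache(64 << 20);   // 64 MB row cache
  options.statistics = rocksdb::CreateDBStatistics();   // to observe hits

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rowcache_demo", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());
  s = db->Flush(rocksdb::FlushOptions());               // key now lives in an SST
  assert(s.ok());

  std::string value;
  db->Get(rocksdb::ReadOptions(), "key", &value);       // miss: populates row cache
  db->Get(rocksdb::ReadOptions(), "key", &value);       // hit: served from row cache

  uint64_t hits = options.statistics->getTickerCount(rocksdb::ROW_CACHE_HIT);
  uint64_t misses = options.statistics->getTickerCount(rocksdb::ROW_CACHE_MISS);
  (void)hits;
  (void)misses;

  delete db;
  return 0;
}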
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
new file mode 100644
index 000000000..b9de824ee
--- /dev/null
+++ b/src/rocksdb/db/table_cache.h
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <string>
+#include <vector>
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Arena;
+struct FileDescriptor;
+class GetContext;
+class HistogramImpl;
+
+// Manages caching for TableReader objects for a column family. The actual
+// cache is allocated separately and passed to the constructor. TableCache
+// wraps around the underlying SST file readers by providing Get(),
+// MultiGet() and NewIterator() methods that hide the instantiation,
+// caching and access to the TableReader. The main purpose of this is
+// performance - by caching the TableReader, it avoids unnecessary file opens
+// and object allocation and instantiation. One exception is compaction, where
+// a new TableReader may be instantiated - see NewIterator() comments
+//
+// Another service provided by TableCache is managing the row cache - if the
+// DB is configured with a row cache, and the lookup key is present in the row
+// cache, lookup is very fast. The row cache is obtained from
+// ioptions.row_cache
+class TableCache {
+ public:
+ TableCache(const ImmutableCFOptions& ioptions,
+ const FileOptions& storage_options, Cache* cache,
+ BlockCacheTracer* const block_cache_tracer);
+ ~TableCache();
+
+ // Return an iterator for the specified file number (the corresponding
+ // file length must be exactly "file_size" bytes). If "table_reader_ptr"
+ // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
+ // underlying the returned iterator, or nullptr if no Table object underlies
+ // the returned iterator. The returned "*table_reader_ptr" object is owned
+ // by the cache and should not be deleted, and is valid for as long as the
+ // returned iterator is live.
+ // @param range_del_agg If non-nullptr, adds range deletions to the
+ // aggregator. If an error occurs, returns it in a NewErrorInternalIterator
+ // @param for_compaction If true, a new TableReader may be allocated (but
+ // not cached), depending on the CF options
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ InternalIterator* NewIterator(
+ const ReadOptions& options, const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
+ const SliceTransform* prefix_extractor, TableReader** table_reader_ptr,
+ HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena,
+ bool skip_filters, int level, const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry
+ // @param get_context Context for get operation. The result of the lookup
+ // can be retrieved by calling get_context->State()
+ // @param file_read_hist If non-nullptr, the file reader statistics are
+ // recorded
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ Status Get(const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k,
+ GetContext* get_context,
+ const SliceTransform* prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ int level = -1);
+
+ // Return the range delete tombstone iterator of the file specified by
+ // `file_meta`.
+ Status GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry
+ // @param mget_range Pointer to the structure describing a batch of keys to
+ // be looked up in this table file. The result is stored
+ // in the embedded GetContext
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ Status MultiGet(const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const MultiGetContext::Range* mget_range,
+ const SliceTransform* prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr,
+ bool skip_filters = false, int level = -1);
+
+ // Evict any entry for the specified file number
+ static void Evict(Cache* cache, uint64_t file_number);
+
+  // Clean the table handle and erase it from the table cache.
+  // Used in DB close, or when the file is no longer live.
+ void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
+
+ // Find table reader
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level == -1 means not specified
+ Status FindTable(const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& file_fd, Cache::Handle**,
+ const SliceTransform* prefix_extractor = nullptr,
+ const bool no_io = false, bool record_read_stats = true,
+ HistogramImpl* file_read_hist = nullptr,
+ bool skip_filters = false, int level = -1,
+ bool prefetch_index_and_filter_in_cache = true);
+
+ // Get TableReader from a cache handle.
+ TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+ // Get the table properties of a given table.
+  // @no_io: if true, do not load the table into the cache when it is not
+  // present in the table cache yet.
+ // @returns: `properties` will be reset on success. Please note that we will
+ // return Status::Incomplete() if table is not present in cache and
+ // we set `no_io` to be true.
+ Status GetTableProperties(const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& file_meta,
+ std::shared_ptr<const TableProperties>* properties,
+ const SliceTransform* prefix_extractor = nullptr,
+ bool no_io = false);
+
+ // Return total memory usage of the table reader of the file.
+ // 0 if table reader of the file is not loaded.
+ size_t GetMemoryUsageByTableReader(
+ const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& fd,
+ const SliceTransform* prefix_extractor = nullptr);
+
+ // Returns approximated offset of a key in a file represented by fd.
+ uint64_t ApproximateOffsetOf(
+ const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const SliceTransform* prefix_extractor = nullptr);
+
+ // Returns approximated data size between start and end keys in a file
+ // represented by fd (the start key must not be greater than the end key).
+ uint64_t ApproximateSize(const Slice& start, const Slice& end,
+ const FileDescriptor& fd, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const SliceTransform* prefix_extractor = nullptr);
+
+ // Release the handle from a cache
+ void ReleaseHandle(Cache::Handle* handle);
+
+ Cache* get_cache() const { return cache_; }
+
+  // Capacity of the backing Cache that indicates infinite TableCache capacity.
+ // For example when max_open_files is -1 we set the backing Cache to this.
+ static const int kInfiniteCapacity = 0x400000;
+
+ // The tables opened with this TableCache will be immortal, i.e., their
+ // lifetime is as long as that of the DB.
+ void SetTablesAreImmortal() {
+ if (cache_->GetCapacity() >= kInfiniteCapacity) {
+ immortal_tables_ = true;
+ }
+ }
+
+ private:
+ // Build a table reader
+ Status GetTableReader(const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileDescriptor& fd, bool sequential_mode,
+ bool record_read_stats, HistogramImpl* file_read_hist,
+ std::unique_ptr<TableReader>* table_reader,
+ const SliceTransform* prefix_extractor = nullptr,
+ bool skip_filters = false, int level = -1,
+ bool prefetch_index_and_filter_in_cache = true);
+
+ // Create a key prefix for looking up the row cache. The prefix is of the
+ // format row_cache_id + fd_number + seq_no. Later, the user key can be
+ // appended to form the full key
+ void CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context, IterKey& row_cache_key);
+
+  // Helper function to look up the row cache for a key. It appends the
+  // user key to row_cache_key at offset prefix_size.
+ bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context);
+
+ const ImmutableCFOptions& ioptions_;
+ const FileOptions& file_options_;
+ Cache* const cache_;
+ std::string row_cache_id_;
+ bool immortal_tables_;
+ BlockCacheTracer* const block_cache_tracer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
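kInfiniteCapacity and SetTablesAreImmortal() correspond to the user-facing max_open_files option: with max_open_files = -1, the backing cache is effectively unbounded and table readers live as long as the DB. A small sketch of the two configurations (the numeric limit is illustrative only):

#include "rocksdb/options.h"

rocksdb::Options MakeTableCacheOptions(bool cache_all_table_readers) {
  rocksdb::Options options;
  if (cache_all_table_readers) {
    // Backing cache gets kInfiniteCapacity and table readers become
    // "immortal": opened once and kept for the lifetime of the DB.
    options.max_open_files = -1;
  } else {
    // Table readers are opened on demand and evicted LRU-style once the
    // limit is approached (a few descriptors are reserved for other uses).
    options.max_open_files = 500;
  }
  return options;
}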
diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc
new file mode 100644
index 000000000..d98ff5e9b
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint64_t GetUint64Property(const UserCollectedProperties& props,
+ const std::string& property_name,
+ bool* property_present) {
+ auto pos = props.find(property_name);
+ if (pos == props.end()) {
+ *property_present = false;
+ return 0;
+ }
+ Slice raw = pos->second;
+ uint64_t val = 0;
+ *property_present = true;
+ return GetVarint64(&raw, &val) ? val : 0;
+}
+
+} // namespace
+
+Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key,
+ const Slice& value,
+ uint64_t file_size) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(key, &ikey)) {
+ return Status::InvalidArgument("Invalid internal key");
+ }
+
+ return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type),
+ ikey.sequence, file_size);
+}
+
+void UserKeyTablePropertiesCollector::BlockAdd(
+ uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast,
+ uint64_t blockCompressedBytesSlow) {
+ return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast,
+ blockCompressedBytesSlow);
+}
+
+Status UserKeyTablePropertiesCollector::Finish(
+ UserCollectedProperties* properties) {
+ return collector_->Finish(properties);
+}
+
+UserCollectedProperties
+UserKeyTablePropertiesCollector::GetReadableProperties() const {
+ return collector_->GetReadableProperties();
+}
+
+uint64_t GetDeletedKeys(
+ const UserCollectedProperties& props) {
+ bool property_present_ignored;
+ return GetUint64Property(props, TablePropertiesNames::kDeletedKeys,
+ &property_present_ignored);
+}
+
+uint64_t GetMergeOperands(const UserCollectedProperties& props,
+ bool* property_present) {
+ return GetUint64Property(
+ props, TablePropertiesNames::kMergeOperands, property_present);
+}
+
+} // namespace ROCKSDB_NAMESPACE
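GetDeletedKeys() and GetMergeOperands() above read varint-encoded counters out of the user-collected properties. A sketch of consuming them through the public API, assuming an already opened rocksdb::DB* and eliding error handling:

#include <cstdint>
#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

// Sum the deleted-key and merge-operand counters across all live SST files.
void DumpDeletionStats(rocksdb::DB* db) {
  rocksdb::TablePropertiesCollection files;
  rocksdb::Status s = db->GetPropertiesOfAllTables(&files);
  if (!s.ok()) {
    return;
  }
  uint64_t total_deleted = 0;
  uint64_t total_merges = 0;
  for (const auto& file : files) {
    const auto& user_props = file.second->user_collected_properties;
    total_deleted += rocksdb::GetDeletedKeys(user_props);
    bool has_merge_count = false;
    uint64_t merges = rocksdb::GetMergeOperands(user_props, &has_merge_count);
    if (has_merge_count) {
      total_merges += merges;
    }
  }
  (void)total_deleted;
  (void)total_merges;
}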
diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h
new file mode 100644
index 000000000..130eb64d4
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Base class for internal table properties collector.
+class IntTblPropCollector {
+ public:
+ virtual ~IntTblPropCollector() {}
+ virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+ virtual const char* Name() const = 0;
+
+  // @param key the user key that is inserted into the table.
+  // @param value the value that is inserted into the table.
+ virtual Status InternalAdd(const Slice& key, const Slice& value,
+ uint64_t file_size) = 0;
+
+ virtual void BlockAdd(uint64_t blockRawBytes,
+ uint64_t blockCompressedBytesFast,
+ uint64_t blockCompressedBytesSlow) = 0;
+
+ virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+ virtual bool NeedCompact() const { return false; }
+};
+
+// Factory for internal table properties collector.
+class IntTblPropCollectorFactory {
+ public:
+ virtual ~IntTblPropCollectorFactory() {}
+ // has to be thread-safe
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t column_family_id) = 0;
+
+ // The name of the properties collector can be used for debugging purpose.
+ virtual const char* Name() const = 0;
+};
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contains meta information of a given entry.
+//
+// This class extracts user key from the encoded internal key when Add() is
+// invoked.
+class UserKeyTablePropertiesCollector : public IntTblPropCollector {
+ public:
+ // transfer of ownership
+ explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+ : collector_(collector) {}
+
+ virtual ~UserKeyTablePropertiesCollector() {}
+
+ virtual Status InternalAdd(const Slice& key, const Slice& value,
+ uint64_t file_size) override;
+
+ virtual void BlockAdd(uint64_t blockRawBytes,
+ uint64_t blockCompressedBytesFast,
+ uint64_t blockCompressedBytesSlow) override;
+
+ virtual Status Finish(UserCollectedProperties* properties) override;
+
+ virtual const char* Name() const override { return collector_->Name(); }
+
+ UserCollectedProperties GetReadableProperties() const override;
+
+ virtual bool NeedCompact() const override {
+ return collector_->NeedCompact();
+ }
+
+ protected:
+ std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+ : public IntTblPropCollectorFactory {
+ public:
+ explicit UserKeyTablePropertiesCollectorFactory(
+ std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+ : user_collector_factory_(user_collector_factory) {}
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t column_family_id) override {
+ TablePropertiesCollectorFactory::Context context;
+ context.column_family_id = column_family_id;
+ return new UserKeyTablePropertiesCollector(
+ user_collector_factory_->CreateTablePropertiesCollector(context));
+ }
+
+ virtual const char* Name() const override {
+ return user_collector_factory_->Name();
+ }
+
+ private:
+ std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
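UserKeyTablePropertiesCollector is the adapter that lets a user-supplied TablePropertiesCollector observe plain user keys. A minimal user-side collector and factory, as a sketch only; the property name "put.count" and both class names are invented for illustration:

#include <cstdint>
#include <memory>
#include <string>
#include "rocksdb/options.h"
#include "rocksdb/table_properties.h"
#include "util/coding.h"  // PutVarint32 (internal helper, used for the encoding)

// Counts Put entries and publishes the count as a custom table property.
class PutCountCollector : public rocksdb::TablePropertiesCollector {
 public:
  const char* Name() const override { return "PutCountCollector"; }

  rocksdb::Status AddUserKey(const rocksdb::Slice& /*key*/,
                             const rocksdb::Slice& /*value*/,
                             rocksdb::EntryType type,
                             rocksdb::SequenceNumber /*seq*/,
                             uint64_t /*file_size*/) override {
    if (type == rocksdb::kEntryPut) {
      ++num_puts_;
    }
    return rocksdb::Status::OK();
  }

  rocksdb::Status Finish(rocksdb::UserCollectedProperties* props) override {
    std::string encoded;
    rocksdb::PutVarint32(&encoded, num_puts_);
    props->insert({"put.count", encoded});
    return rocksdb::Status::OK();
  }

  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return {{"put.count", std::to_string(num_puts_)}};
  }

 private:
  uint32_t num_puts_ = 0;
};

class PutCountCollectorFactory : public rocksdb::TablePropertiesCollectorFactory {
 public:
  rocksdb::TablePropertiesCollector* CreateTablePropertiesCollector(
      rocksdb::TablePropertiesCollectorFactory::Context /*context*/) override {
    return new PutCountCollector();
  }
  const char* Name() const override { return "PutCountCollectorFactory"; }
};

// Registration sketch:
//   options.table_properties_collector_factories.emplace_back(
//       std::make_shared<PutCountCollectorFactory>());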
diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc
new file mode 100644
index 000000000..5c202de81
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector_test.cc
@@ -0,0 +1,515 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/table_properties_collector.h"
+#include "env/composite_env_wrapper.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TablePropertiesTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ void SetUp() override { backward_mode_ = GetParam(); }
+
+ bool backward_mode_;
+};
+
+// Utilities test functions
+namespace {
+static const uint32_t kTestColumnFamilyId = 66;
+static const std::string kTestColumnFamilyName = "test_column_fam";
+
+void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& moptions,
+ const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ std::unique_ptr<WritableFileWriter>* writable,
+ std::unique_ptr<TableBuilder>* builder) {
+ std::unique_ptr<WritableFile> wf(new test::StringSink);
+ writable->reset(
+ new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)),
+ "" /* don't care */, EnvOptions()));
+ int unknown_level = -1;
+ builder->reset(NewTableBuilder(
+ ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories,
+ kTestColumnFamilyId, kTestColumnFamilyName, writable->get(),
+ options.compression, options.sample_for_compression,
+ options.compression_opts, unknown_level));
+}
+} // namespace
+
+// Collects keys that start with "A" in a table.
+class RegularKeysStartWithA: public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ std::string encoded_num_puts;
+ std::string encoded_num_deletes;
+ std::string encoded_num_single_deletes;
+ std::string encoded_num_size_changes;
+ PutVarint32(&encoded, count_);
+ PutVarint32(&encoded_num_puts, num_puts_);
+ PutVarint32(&encoded_num_deletes, num_deletes_);
+ PutVarint32(&encoded_num_single_deletes, num_single_deletes_);
+ PutVarint32(&encoded_num_size_changes, num_size_changes_);
+ *properties = UserCollectedProperties{
+ {"TablePropertiesTest", message_},
+ {"Count", encoded},
+ {"NumPuts", encoded_num_puts},
+ {"NumDeletes", encoded_num_deletes},
+ {"NumSingleDeletes", encoded_num_single_deletes},
+ {"NumSizeChanges", encoded_num_size_changes},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& user_key, const Slice& /*value*/,
+ EntryType type, SequenceNumber /*seq*/,
+ uint64_t file_size) override {
+    // Simply assume all user keys are non-empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ if (type == kEntryPut) {
+ num_puts_++;
+ } else if (type == kEntryDelete) {
+ num_deletes_++;
+ } else if (type == kEntrySingleDelete) {
+ num_single_deletes_++;
+ }
+ if (file_size < file_size_) {
+ message_ = "File size should not decrease.";
+ } else if (file_size != file_size_) {
+ num_size_changes_++;
+ }
+
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ std::string message_ = "Rocksdb";
+ uint32_t count_ = 0;
+ uint32_t num_puts_ = 0;
+ uint32_t num_deletes_ = 0;
+ uint32_t num_single_deletes_ = 0;
+ uint32_t num_size_changes_ = 0;
+ uint64_t file_size_ = 0;
+};
+
+// Collects keys that start with "A" in a table. Backward-compatible mode.
+// It is also used to test internal key table property collector
+class RegularKeysStartWithABackwardCompatible
+ : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+ {"Count", encoded}};
+ return Status::OK();
+ }
+
+ Status Add(const Slice& user_key, const Slice& /*value*/) override {
+    // Simply assume all user keys are non-empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAInternal : public IntTblPropCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+ {"Count", encoded}};
+ return Status::OK();
+ }
+
+ Status InternalAdd(const Slice& user_key, const Slice& /*value*/,
+ uint64_t /*file_size*/) override {
+    // Simply assume all user keys are non-empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ return Status::OK();
+ }
+
+ void BlockAdd(uint64_t /* blockRawBytes */,
+ uint64_t /* blockCompressedBytesFast */,
+ uint64_t /* blockCompressedBytesSlow */) override {
+ // Nothing to do.
+ return;
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
+ public TablePropertiesCollectorFactory {
+ public:
+ explicit RegularKeysStartWithAFactory(bool backward_mode)
+ : backward_mode_(backward_mode) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override {
+ EXPECT_EQ(kTestColumnFamilyId, context.column_family_id);
+ if (!backward_mode_) {
+ return new RegularKeysStartWithA();
+ } else {
+ return new RegularKeysStartWithABackwardCompatible();
+ }
+ }
+ IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t /*column_family_id*/) override {
+ return new RegularKeysStartWithAInternal();
+ }
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ bool backward_mode_;
+};
+
+class FlushBlockEveryThreePolicy : public FlushBlockPolicy {
+ public:
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ return (++count_ % 3U == 0);
+ }
+
+ private:
+ uint64_t count_ = 0;
+};
+
+class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ explicit FlushBlockEveryThreePolicyFactory() {}
+
+ const char* Name() const override {
+ return "FlushBlockEveryThreePolicyFactory";
+ }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& /*data_block_builder*/) const override {
+ return new FlushBlockEveryThreePolicy;
+ }
+};
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+namespace {
+void TestCustomizedTablePropertiesCollector(
+ bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector,
+ const Options& options, const InternalKeyComparator& internal_comparator) {
+  // Make sure the entries will be inserted in order.
+ std::map<std::pair<std::string, ValueType>, std::string> kvs = {
+ {{"About ", kTypeValue}, "val5"}, // starts with 'A'
+ {{"Abstract", kTypeValue}, "val2"}, // starts with 'A'
+ {{"Around ", kTypeValue}, "val7"}, // starts with 'A'
+ {{"Beyond ", kTypeValue}, "val3"},
+ {{"Builder ", kTypeValue}, "val1"},
+ {{"Love ", kTypeDeletion}, ""},
+ {{"Cancel ", kTypeValue}, "val4"},
+ {{"Find ", kTypeValue}, "val6"},
+ {{"Rocks ", kTypeDeletion}, ""},
+ {{"Foo ", kTypeSingleDeletion}, ""},
+ };
+
+ // -- Step 1: build table
+ std::unique_ptr<TableBuilder> builder;
+ std::unique_ptr<WritableFileWriter> writer;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ if (test_int_tbl_prop_collector) {
+ int_tbl_prop_collector_factories.emplace_back(
+ new RegularKeysStartWithAFactory(backward_mode));
+ } else {
+ GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+ }
+ MakeBuilder(options, ioptions, moptions, internal_comparator,
+ &int_tbl_prop_collector_factories, &writer, &builder);
+
+ SequenceNumber seqNum = 0U;
+ for (const auto& kv : kvs) {
+ InternalKey ikey(kv.first.first, seqNum++, kv.first.second);
+ builder->Add(ikey.Encode(), kv.second);
+ }
+ ASSERT_OK(builder->Finish());
+ writer->Flush();
+
+ // -- Step 2: Read properties
+ LegacyWritableFileWrapper* file =
+ static_cast<LegacyWritableFileWrapper*>(writer->writable_file());
+ test::StringSink* fwf = static_cast<test::StringSink*>(file->target());
+ std::unique_ptr<RandomAccessFileReader> fake_file_reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(fwf->contents())));
+ TableProperties* props;
+ Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(),
+ magic_number, ioptions, &props,
+ true /* compression_type_missing */);
+ std::unique_ptr<TableProperties> props_guard(props);
+ ASSERT_OK(s);
+
+ auto user_collected = props->user_collected_properties;
+
+ ASSERT_NE(user_collected.find("TablePropertiesTest"), user_collected.end());
+ ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+ uint32_t starts_with_A = 0;
+ ASSERT_NE(user_collected.find("Count"), user_collected.end());
+ Slice key(user_collected.at("Count"));
+ ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+ ASSERT_EQ(3u, starts_with_A);
+
+ if (!backward_mode && !test_int_tbl_prop_collector) {
+ uint32_t num_puts;
+ ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+ Slice key_puts(user_collected.at("NumPuts"));
+ ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+ ASSERT_EQ(7u, num_puts);
+
+ uint32_t num_deletes;
+ ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+ Slice key_deletes(user_collected.at("NumDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+ ASSERT_EQ(2u, num_deletes);
+
+ uint32_t num_single_deletes;
+ ASSERT_NE(user_collected.find("NumSingleDeletes"), user_collected.end());
+ Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+ ASSERT_EQ(1u, num_single_deletes);
+
+ uint32_t num_size_changes;
+ ASSERT_NE(user_collected.find("NumSizeChanges"), user_collected.end());
+ Slice key_size_changes(user_collected.at("NumSizeChanges"));
+ ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes));
+ ASSERT_GE(num_size_changes, 2u);
+ }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+ // Test properties collectors with internal keys or regular keys
+ // for block based table
+ for (bool encode_as_internal : { true, false }) {
+ Options options;
+ BlockBasedTableOptions table_options;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryThreePolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ test::PlainInternalKeyComparator ikc(options.comparator);
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+ new RegularKeysStartWithAFactory(backward_mode_));
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ TestCustomizedTablePropertiesCollector(backward_mode_,
+ kBlockBasedTableMagicNumber,
+ encode_as_internal, options, ikc);
+
+#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite
+ // test plain table
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ options.table_factory =
+ std::make_shared<PlainTableFactory>(plain_table_options);
+ TestCustomizedTablePropertiesCollector(backward_mode_,
+ kPlainTableMagicNumber,
+ encode_as_internal, options, ikc);
+#endif // !ROCKSDB_LITE
+ }
+}
+
+namespace {
+void TestInternalKeyPropertiesCollector(
+ bool backward_mode, uint64_t magic_number, bool sanitized,
+ std::shared_ptr<TableFactory> table_factory) {
+ InternalKey keys[] = {
+ InternalKey("A ", 0, ValueType::kTypeValue),
+ InternalKey("B ", 1, ValueType::kTypeValue),
+ InternalKey("C ", 2, ValueType::kTypeValue),
+ InternalKey("W ", 3, ValueType::kTypeDeletion),
+ InternalKey("X ", 4, ValueType::kTypeDeletion),
+ InternalKey("Y ", 5, ValueType::kTypeDeletion),
+ InternalKey("Z ", 6, ValueType::kTypeDeletion),
+ InternalKey("a ", 7, ValueType::kTypeSingleDeletion),
+ InternalKey("b ", 8, ValueType::kTypeMerge),
+ InternalKey("c ", 9, ValueType::kTypeMerge),
+ };
+
+ std::unique_ptr<TableBuilder> builder;
+ std::unique_ptr<WritableFileWriter> writable;
+ Options options;
+ test::PlainInternalKeyComparator pikc(options.comparator);
+
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ options.table_factory = table_factory;
+ if (sanitized) {
+ options.table_properties_collector_factories.emplace_back(
+ new RegularKeysStartWithAFactory(backward_mode));
+    // With sanitization, even a regular properties collector will be able to
+    // handle internal keys.
+ auto comparator = options.comparator;
+ // HACK: Set options.info_log to avoid writing log in
+ // SanitizeOptions().
+ options.info_log = std::make_shared<test::NullLogger>();
+    options = SanitizeOptions("db", // just a placeholder
+ options);
+ ImmutableCFOptions ioptions(options);
+ GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+ options.comparator = comparator;
+ }
+ const ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeBuilder(options, ioptions, moptions, pikc,
+ &int_tbl_prop_collector_factories, &writable, &builder);
+ for (const auto& k : keys) {
+ builder->Add(k.Encode(), "val");
+ }
+
+ ASSERT_OK(builder->Finish());
+ writable->Flush();
+
+ LegacyWritableFileWrapper* file =
+ static_cast<LegacyWritableFileWrapper*>(writable->writable_file());
+ test::StringSink* fwf = static_cast<test::StringSink*>(file->target());
+ std::unique_ptr<RandomAccessFileReader> reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(fwf->contents())));
+ TableProperties* props;
+ Status s =
+ ReadTableProperties(reader.get(), fwf->contents().size(), magic_number,
+ ioptions, &props, true /* compression_type_missing */);
+ ASSERT_OK(s);
+
+ std::unique_ptr<TableProperties> props_guard(props);
+ auto user_collected = props->user_collected_properties;
+ uint64_t deleted = GetDeletedKeys(user_collected);
+ ASSERT_EQ(5u, deleted); // deletes + single-deletes
+
+ bool property_present;
+ uint64_t merges = GetMergeOperands(user_collected, &property_present);
+ ASSERT_TRUE(property_present);
+ ASSERT_EQ(2u, merges);
+
+ if (sanitized) {
+ uint32_t starts_with_A = 0;
+ ASSERT_NE(user_collected.find("Count"), user_collected.end());
+ Slice key(user_collected.at("Count"));
+ ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+ ASSERT_EQ(1u, starts_with_A);
+
+ if (!backward_mode) {
+ uint32_t num_puts;
+ ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+ Slice key_puts(user_collected.at("NumPuts"));
+ ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+ ASSERT_EQ(3u, num_puts);
+
+ uint32_t num_deletes;
+ ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+ Slice key_deletes(user_collected.at("NumDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+ ASSERT_EQ(4u, num_deletes);
+
+ uint32_t num_single_deletes;
+ ASSERT_NE(user_collected.find("NumSingleDeletes"),
+ user_collected.end());
+ Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+ ASSERT_EQ(1u, num_single_deletes);
+ }
+ }
+ }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kBlockBasedTableMagicNumber, true /* sanitize */,
+ std::make_shared<BlockBasedTableFactory>());
+ if (backward_mode_) {
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kBlockBasedTableMagicNumber, false /* not sanitize */,
+ std::make_shared<BlockBasedTableFactory>());
+ }
+
+#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kPlainTableMagicNumber, false /* not sanitize */,
+ std::make_shared<PlainTableFactory>(plain_table_options));
+#endif // !ROCKSDB_LITE
+}
+
+INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest,
+ ::testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(CustomizedTablePropertiesCollector, TablePropertiesTest,
+ ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc
new file mode 100644
index 000000000..56bc161a3
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.cc
@@ -0,0 +1,315 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/transaction_log_impl.h"
+#include <cinttypes>
+#include "db/write_batch_internal.h"
+#include "file/sequence_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+ const std::string& dir, const ImmutableDBOptions* options,
+ const TransactionLogIterator::ReadOptions& read_options,
+ const EnvOptions& soptions, const SequenceNumber seq,
+ std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+ const bool seq_per_batch)
+ : dir_(dir),
+ options_(options),
+ read_options_(read_options),
+ soptions_(soptions),
+ starting_sequence_number_(seq),
+ files_(std::move(files)),
+ started_(false),
+ is_valid_(false),
+ current_file_index_(0),
+ current_batch_seq_(0),
+ current_last_seq_(0),
+ versions_(versions),
+ seq_per_batch_(seq_per_batch) {
+ assert(files_ != nullptr);
+ assert(versions_ != nullptr);
+
+ reporter_.env = options_->env;
+ reporter_.info_log = options_->info_log.get();
+ SeekToStartSequence(); // Seek till starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+ const LogFile* log_file,
+ std::unique_ptr<SequentialFileReader>* file_reader) {
+ FileSystem* fs = options_->fs.get();
+ std::unique_ptr<FSSequentialFile> file;
+ std::string fname;
+ Status s;
+ EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_);
+ if (log_file->Type() == kArchivedLogFile) {
+ fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ } else {
+ fname = LogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ if (!s.ok()) {
+ // If cannot open file in DB directory.
+      // Try the archive dir, as it could have moved in the meantime.
+ fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options,
+ &file, nullptr);
+ }
+ }
+ if (s.ok()) {
+ file_reader->reset(new SequentialFileReader(std::move(file), fname));
+ }
+ return s;
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch() {
+  assert(is_valid_); // cannot call in an invalid state.
+ BatchResult result;
+ result.sequence = current_batch_seq_;
+ result.writeBatchPtr = std::move(current_batch_);
+ return result;
+}
+
+Status TransactionLogIteratorImpl::status() { return current_status_; }
+
+bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; }
+
+bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) {
+ // Don't read if no more complete entries to read from logs
+ if (current_last_seq_ >= versions_->LastSequence()) {
+ return false;
+ }
+ return current_log_reader_->ReadRecord(record, &scratch_);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index,
+ bool strict) {
+ Slice record;
+ started_ = false;
+ is_valid_ = false;
+ if (files_->size() <= start_file_index) {
+ return;
+ }
+ Status s =
+ OpenLogReader(files_->at(static_cast<size_t>(start_file_index)).get());
+ if (!s.ok()) {
+ current_status_ = s;
+ reporter_.Info(current_status_.ToString().c_str());
+ return;
+ }
+ while (RestrictedRead(&record)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter_.Corruption(
+ record.size(), Status::Corruption("very small log record"));
+ continue;
+ }
+ UpdateCurrentWriteBatch(record);
+ if (current_last_seq_ >= starting_sequence_number_) {
+ if (strict && current_batch_seq_ != starting_sequence_number_) {
+ current_status_ = Status::Corruption(
+ "Gap in sequence number. Could not "
+ "seek to required sequence number");
+ reporter_.Info(current_status_.ToString().c_str());
+ return;
+ } else if (strict) {
+ reporter_.Info("Could seek required sequence number. Iterator will "
+ "continue.");
+ }
+ is_valid_ = true;
+ started_ = true; // set started_ as we could seek till starting sequence
+ return;
+ } else {
+ is_valid_ = false;
+ }
+ }
+
+  // Could not find the start sequence in the first file. Normally this must
+  // be the only file; otherwise log the error and let the iterator return the
+  // next entry. If strict is set, we want to seek exactly to the start
+  // sequence, and it should have been present in the file we scanned above.
+ if (strict) {
+ current_status_ = Status::Corruption(
+ "Gap in sequence number. Could not "
+ "seek to required sequence number");
+ reporter_.Info(current_status_.ToString().c_str());
+ } else if (files_->size() != 1) {
+ current_status_ = Status::Corruption(
+ "Start sequence was not found, "
+ "skipping to the next available");
+ reporter_.Info(current_status_.ToString().c_str());
+ // Let NextImpl find the next available entry. started_ remains false
+ // because we don't want to check for gaps while moving to start sequence
+ NextImpl(true);
+ }
+}
+
+void TransactionLogIteratorImpl::Next() {
+ return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+ Slice record;
+ is_valid_ = false;
+ if (!internal && !started_) {
+ // Runs every time until we can seek to the start sequence
+ return SeekToStartSequence();
+ }
+  while (true) {
+ assert(current_log_reader_);
+ if (current_log_reader_->IsEOF()) {
+ current_log_reader_->UnmarkEOF();
+ }
+ while (RestrictedRead(&record)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter_.Corruption(
+ record.size(), Status::Corruption("very small log record"));
+ continue;
+ } else {
+ // started_ should be true if called by application
+ assert(internal || started_);
+ // started_ should be false if called internally
+ assert(!internal || !started_);
+ UpdateCurrentWriteBatch(record);
+ if (internal && !started_) {
+ started_ = true;
+ }
+ return;
+ }
+ }
+
+ // Open the next file
+ if (current_file_index_ < files_->size() - 1) {
+ ++current_file_index_;
+ Status s = OpenLogReader(files_->at(current_file_index_).get());
+ if (!s.ok()) {
+ is_valid_ = false;
+ current_status_ = s;
+ return;
+ }
+ } else {
+ is_valid_ = false;
+ if (current_last_seq_ == versions_->LastSequence()) {
+ current_status_ = Status::OK();
+ } else {
+ const char* msg = "Create a new iterator to fetch the new tail.";
+ current_status_ = Status::TryAgain(msg);
+ }
+ return;
+ }
+ }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+ const WriteBatch* batch, const SequenceNumber expected_seq) {
+ assert(batch);
+ SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+ if (batchSeq != expected_seq) {
+ char buf[200];
+ snprintf(buf, sizeof(buf),
+ "Discontinuity in log records. Got seq=%" PRIu64
+ ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
+ ".Log iterator will reseek the correct batch.",
+ batchSeq, expected_seq, versions_->LastSequence());
+ reporter_.Info(buf);
+ return false;
+ }
+ return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+ std::unique_ptr<WriteBatch> batch(new WriteBatch());
+ WriteBatchInternal::SetContents(batch.get(), record);
+
+ SequenceNumber expected_seq = current_last_seq_ + 1;
+ // If the iterator has started, then confirm that we get continuous batches
+ if (started_ && !IsBatchExpected(batch.get(), expected_seq)) {
+ // Seek to the batch having expected sequence number
+ if (expected_seq < files_->at(current_file_index_)->StartSequence()) {
+ // Expected batch must lie in the previous log file
+ // Avoid underflow.
+ if (current_file_index_ != 0) {
+ current_file_index_--;
+ }
+ }
+ starting_sequence_number_ = expected_seq;
+    // current_status_ will be set to OK if the reseek succeeds.
+    // Note: this is still ok in seq_per_batch_ && two_write_queues_ mode
+ // that allows gaps in the WAL since it will still skip over the gap.
+ current_status_ = Status::NotFound("Gap in sequence numbers");
+ // In seq_per_batch_ mode, gaps in the seq are possible so the strict mode
+ // should be disabled
+ return SeekToStartSequence(current_file_index_, !seq_per_batch_);
+ }
+
+ struct BatchCounter : public WriteBatch::Handler {
+ SequenceNumber sequence_;
+ BatchCounter(SequenceNumber sequence) : sequence_(sequence) {}
+ Status MarkNoop(bool empty_batch) override {
+ if (!empty_batch) {
+ sequence_++;
+ }
+ return Status::OK();
+ }
+ Status MarkEndPrepare(const Slice&) override {
+ sequence_++;
+ return Status::OK();
+ }
+ Status MarkCommit(const Slice&) override {
+ sequence_++;
+ return Status::OK();
+ }
+
+ Status PutCF(uint32_t /*cf*/, const Slice& /*key*/,
+ const Slice& /*val*/) override {
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override {
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override {
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*cf*/, const Slice& /*key*/,
+ const Slice& /*val*/) override {
+ return Status::OK();
+ }
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
+ Status MarkRollback(const Slice&) override { return Status::OK(); }
+ };
+
+ current_batch_seq_ = WriteBatchInternal::Sequence(batch.get());
+ if (seq_per_batch_) {
+ BatchCounter counter(current_batch_seq_);
+ batch->Iterate(&counter);
+ current_last_seq_ = counter.sequence_;
+ } else {
+ current_last_seq_ =
+ current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1;
+ }
+  // current_batch_seq_ can only change here
+ assert(current_last_seq_ <= versions_->LastSequence());
+
+ current_batch_ = std::move(batch);
+ is_valid_ = true;
+ current_status_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) {
+ std::unique_ptr<SequentialFileReader> file;
+ Status s = OpenLogFile(log_file, &file);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(file);
+ current_log_reader_.reset(
+ new log::Reader(options_->info_log, std::move(file), &reporter_,
+ read_options_.verify_checksums_, log_file->LogNumber()));
+ return Status::OK();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
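TransactionLogIteratorImpl is the object returned by DB::GetUpdatesSince(). A tailing sketch against the public API; WAL retention (for example via DBOptions::WAL_ttl_seconds) is assumed so older batches remain readable, and error handling is abbreviated:

#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

// Replay every write batch at or after `since`, assuming `db` is open.
void TailWal(rocksdb::DB* db, rocksdb::SequenceNumber since) {
  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(since, &iter);
  if (!s.ok()) {
    return;
  }
  for (; iter->Valid(); iter->Next()) {
    rocksdb::BatchResult batch = iter->GetBatch();
    // batch.sequence is the first sequence number in the batch;
    // batch.writeBatchPtr owns the WriteBatch read from the WAL.
    (void)batch;
  }
  // status() distinguishes a clean end-of-log from corruption or TryAgain.
  s = iter->status();
  (void)s;
}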
diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h
new file mode 100644
index 000000000..eb53daf2b
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFileImpl : public LogFile {
+ public:
+ LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+ uint64_t sizeBytes) :
+ logNumber_(logNum),
+ type_(logType),
+ startSequence_(startSeq),
+ sizeFileBytes_(sizeBytes) {
+ }
+
+ std::string PathName() const override {
+ if (type_ == kArchivedLogFile) {
+ return ArchivedLogFileName("", logNumber_);
+ }
+ return LogFileName("", logNumber_);
+ }
+
+ uint64_t LogNumber() const override { return logNumber_; }
+
+ WalFileType Type() const override { return type_; }
+
+ SequenceNumber StartSequence() const override { return startSequence_; }
+
+ uint64_t SizeFileBytes() const override { return sizeFileBytes_; }
+
+ bool operator < (const LogFile& that) const {
+ return LogNumber() < that.LogNumber();
+ }
+
+ private:
+ uint64_t logNumber_;
+ WalFileType type_;
+ SequenceNumber startSequence_;
+ uint64_t sizeFileBytes_;
+
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+ TransactionLogIteratorImpl(
+ const std::string& dir, const ImmutableDBOptions* options,
+ const TransactionLogIterator::ReadOptions& read_options,
+ const EnvOptions& soptions, const SequenceNumber seqNum,
+ std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+ const bool seq_per_batch);
+
+ virtual bool Valid() override;
+
+ virtual void Next() override;
+
+ virtual Status status() override;
+
+ virtual BatchResult GetBatch() override;
+
+ private:
+ const std::string& dir_;
+ const ImmutableDBOptions* options_;
+ const TransactionLogIterator::ReadOptions read_options_;
+ const EnvOptions& soptions_;
+ SequenceNumber starting_sequence_number_;
+ std::unique_ptr<VectorLogPtr> files_;
+ bool started_;
+  bool is_valid_; // not valid when it starts off.
+ Status current_status_;
+ size_t current_file_index_;
+ std::unique_ptr<WriteBatch> current_batch_;
+ std::unique_ptr<log::Reader> current_log_reader_;
+ std::string scratch_;
+ Status OpenLogFile(const LogFile* log_file,
+ std::unique_ptr<SequentialFileReader>* file);
+
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ virtual void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes,
+ s.ToString().c_str());
+ }
+ virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); }
+ } reporter_;
+
+ SequenceNumber
+ current_batch_seq_; // sequence number at start of current batch
+ SequenceNumber current_last_seq_; // last sequence in the current batch
+ // Used only to get latest seq. num
+ // TODO(icanadi) can this be just a callback?
+ VersionSet const* const versions_;
+ const bool seq_per_batch_;
+ // Reads from transaction log only if the writebatch record has been written
+ bool RestrictedRead(Slice* record);
+  // Seeks to starting_sequence_number_ reading from start_file_index in
+  // files_. If strict is set, then it must get a batch starting with
+  // starting_sequence_number_.
+ void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false);
+  // Implementation of Next. SeekToStartSequence calls it internally with
+  // internal=true to let it find the next entry even if it has to jump gaps,
+  // because the iterator may start off from the first available entry but
+  // promises to be continuous after that.
+ void NextImpl(bool internal = false);
+  // Check if the batch has the expected sequence number; return false if not.
+  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq);
+  // Update the current batch if a continuous batch is found.
+  void UpdateCurrentWriteBatch(const Slice& record);
+ Status OpenLogReader(const LogFile* file);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
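For context: this iterator is the concrete type that DB::GetUpdatesSince() hands back to callers. A minimal consumer sketch, assuming an already-open rocksdb::DB* and a non-ROCKSDB_LITE build (the helper name TailWal is made up for illustration):

    #include <memory>

    #include "rocksdb/db.h"
    #include "rocksdb/transaction_log.h"

    // Tails the WAL starting at from_seq, using the iterator declared above.
    void TailWal(rocksdb::DB* db, rocksdb::SequenceNumber from_seq) {
      std::unique_ptr<rocksdb::TransactionLogIterator> iter;
      rocksdb::Status s = db->GetUpdatesSince(from_seq, &iter);
      if (!s.ok()) {
        return;  // e.g. the requested sequence number is no longer in the WAL
      }
      for (; iter->Valid(); iter->Next()) {
        rocksdb::BatchResult batch = iter->GetBatch();
        // batch.sequence is the sequence number at the start of the batch;
        // batch.writeBatchPtr holds the recovered WriteBatch.
        (void)batch;
      }
    }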
diff --git a/src/rocksdb/db/trim_history_scheduler.cc b/src/rocksdb/db/trim_history_scheduler.cc
new file mode 100644
index 000000000..d7ca0899f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/trim_history_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ cfd->Ref();
+ cfds_.push_back(cfd);
+ is_empty_.store(false, std::memory_order_relaxed);
+}
+
+ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ while (true) {
+ if (cfds_.empty()) {
+ return nullptr;
+ }
+ ColumnFamilyData* cfd = cfds_.back();
+ cfds_.pop_back();
+ if (cfds_.empty()) {
+ is_empty_.store(true, std::memory_order_relaxed);
+ }
+
+ if (!cfd->IsDropped()) {
+ // success
+ return cfd;
+ }
+ cfd->UnrefAndTryDelete();
+ }
+}
+
+bool TrimHistoryScheduler::Empty() {
+ bool is_empty = is_empty_.load(std::memory_order_relaxed);
+ return is_empty;
+}
+
+void TrimHistoryScheduler::Clear() {
+ ColumnFamilyData* cfd;
+ while ((cfd = TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ }
+ assert(Empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/trim_history_scheduler.h b/src/rocksdb/db/trim_history_scheduler.h
new file mode 100644
index 000000000..b17f6170f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <mutex>
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps
+// track of column families whose flushed immutable memtables may need to be
+// removed (aka trimmed). The actual trimming may be slightly delayed. Due to
+// the use of the mutex and atomic variable, ScheduleWork,
+// TakeNextColumnFamily, and Empty can be called concurrently.
+class TrimHistoryScheduler {
+ public:
+ TrimHistoryScheduler() : is_empty_(true) {}
+
+ // When a column family needs history trimming, add cfd to the FIFO queue
+ void ScheduleWork(ColumnFamilyData* cfd);
+
+  // Remove the column family from the queue; the caller is responsible for
+  // calling `MemtableList::TrimHistory`.
+ ColumnFamilyData* TakeNextColumnFamily();
+
+ bool Empty();
+
+ void Clear();
+
+  // Not on the critical path; a mutex is used to ensure thread safety.
+ private:
+ std::atomic<bool> is_empty_;
+ autovector<ColumnFamilyData*> cfds_;
+ std::mutex checking_mutex_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
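The drain side implied by the comments above (and by Clear() in the .cc file) looks roughly like the sketch below. TrimFlushedMemtables() is a hypothetical stand-in for the real trimming step, which in the actual code path goes through the column family's MemtableList::TrimHistory:

    // Rough sketch of a TrimHistoryScheduler consumer. ScheduleWork() takes a
    // reference on each ColumnFamilyData, so the drainer must release it with
    // UnrefAndTryDelete() once the trimming work is done.
    void TrimFlushedMemtables(ROCKSDB_NAMESPACE::ColumnFamilyData* cfd);  // hypothetical helper

    void DrainTrimQueue(ROCKSDB_NAMESPACE::TrimHistoryScheduler* scheduler) {
      while (!scheduler->Empty()) {
        ROCKSDB_NAMESPACE::ColumnFamilyData* cfd =
            scheduler->TakeNextColumnFamily();
        if (cfd == nullptr) {
          break;  // another thread drained the queue first
        }
        TrimFlushedMemtables(cfd);  // hypothetical; see note above
        cfd->UnrefAndTryDelete();   // balances the Ref() taken in ScheduleWork()
      }
    }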
diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc
new file mode 100644
index 000000000..4694218a1
--- /dev/null
+++ b/src/rocksdb/db/version_builder.cc
@@ -0,0 +1,545 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_builder.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <map>
+#include <set>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "table/table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
+ if (a->fd.largest_seqno != b->fd.largest_seqno) {
+ return a->fd.largest_seqno > b->fd.largest_seqno;
+ }
+ if (a->fd.smallest_seqno != b->fd.smallest_seqno) {
+ return a->fd.smallest_seqno > b->fd.smallest_seqno;
+ }
+ // Break ties by file number
+ return a->fd.GetNumber() > b->fd.GetNumber();
+}
+
+namespace {
+bool BySmallestKey(FileMetaData* a, FileMetaData* b,
+ const InternalKeyComparator* cmp) {
+ int r = cmp->Compare(a->smallest, b->smallest);
+ if (r != 0) {
+ return (r < 0);
+ }
+ // Break ties by file number
+ return (a->fd.GetNumber() < b->fd.GetNumber());
+}
+} // namespace
+
+class VersionBuilder::Rep {
+ private:
+ // Helper to sort files_ in v
+ // kLevel0 -- NewestFirstBySeqNo
+ // kLevelNon0 -- BySmallestKey
+ struct FileComparator {
+ enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method;
+ const InternalKeyComparator* internal_comparator;
+
+ FileComparator() : internal_comparator(nullptr) {}
+
+ bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+ switch (sort_method) {
+ case kLevel0:
+ return NewestFirstBySeqNo(f1, f2);
+ case kLevelNon0:
+ return BySmallestKey(f1, f2, internal_comparator);
+ }
+ assert(false);
+ return false;
+ }
+ };
+
+ struct LevelState {
+ std::unordered_set<uint64_t> deleted_files;
+ // Map from file number to file meta data.
+ std::unordered_map<uint64_t, FileMetaData*> added_files;
+ };
+
+ const FileOptions& file_options_;
+ Logger* info_log_;
+ TableCache* table_cache_;
+ VersionStorageInfo* base_vstorage_;
+ int num_levels_;
+ LevelState* levels_;
+ // Store states of levels larger than num_levels_. We do this instead of
+ // storing them in levels_ to avoid regression in case there are no files
+ // on invalid levels. The version is not consistent if in the end the files
+ // on invalid levels don't cancel out.
+ std::map<int, std::unordered_set<uint64_t>> invalid_levels_;
+ // Whether there are invalid new files or invalid deletion on levels larger
+ // than num_levels_.
+ bool has_invalid_levels_;
+ FileComparator level_zero_cmp_;
+ FileComparator level_nonzero_cmp_;
+
+ public:
+ Rep(const FileOptions& file_options, Logger* info_log,
+ TableCache* table_cache,
+ VersionStorageInfo* base_vstorage)
+ : file_options_(file_options),
+ info_log_(info_log),
+ table_cache_(table_cache),
+ base_vstorage_(base_vstorage),
+ num_levels_(base_vstorage->num_levels()),
+ has_invalid_levels_(false) {
+ levels_ = new LevelState[num_levels_];
+ level_zero_cmp_.sort_method = FileComparator::kLevel0;
+ level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
+ level_nonzero_cmp_.internal_comparator =
+ base_vstorage_->InternalComparator();
+ }
+
+ ~Rep() {
+ for (int level = 0; level < num_levels_; level++) {
+ const auto& added = levels_[level].added_files;
+ for (auto& pair : added) {
+ UnrefFile(pair.second);
+ }
+ }
+
+ delete[] levels_;
+ }
+
+ void UnrefFile(FileMetaData* f) {
+ f->refs--;
+ if (f->refs <= 0) {
+ if (f->table_reader_handle) {
+ assert(table_cache_ != nullptr);
+ table_cache_->ReleaseHandle(f->table_reader_handle);
+ f->table_reader_handle = nullptr;
+ }
+ delete f;
+ }
+ }
+
+ Status CheckConsistency(VersionStorageInfo* vstorage) {
+#ifdef NDEBUG
+ if (!vstorage->force_consistency_checks()) {
+      // Don't run consistency checks in release mode except if
+      // explicitly asked to
+ return Status::OK();
+ }
+#endif
+ // make sure the files are sorted correctly
+ for (int level = 0; level < num_levels_; level++) {
+ auto& level_files = vstorage->LevelFiles(level);
+ for (size_t i = 1; i < level_files.size(); i++) {
+ auto f1 = level_files[i - 1];
+ auto f2 = level_files[i];
+#ifndef NDEBUG
+ auto pair = std::make_pair(&f1, &f2);
+ TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency", &pair);
+#endif
+ if (level == 0) {
+ if (!level_zero_cmp_(f1, f2)) {
+ fprintf(stderr, "L0 files are not sorted properly");
+ return Status::Corruption("L0 files are not sorted properly");
+ }
+
+ if (f2->fd.smallest_seqno == f2->fd.largest_seqno) {
+ // This is an external file that we ingested
+ SequenceNumber external_file_seqno = f2->fd.smallest_seqno;
+ if (!(external_file_seqno < f1->fd.largest_seqno ||
+ external_file_seqno == 0)) {
+ fprintf(stderr,
+ "L0 file with seqno %" PRIu64 " %" PRIu64
+ " vs. file with global_seqno %" PRIu64 "\n",
+ f1->fd.smallest_seqno, f1->fd.largest_seqno,
+ external_file_seqno);
+ return Status::Corruption(
+ "L0 file with seqno " +
+ NumberToString(f1->fd.smallest_seqno) + " " +
+ NumberToString(f1->fd.largest_seqno) +
+ " vs. file with global_seqno" +
+ NumberToString(external_file_seqno) + " with fileNumber " +
+ NumberToString(f1->fd.GetNumber()));
+ }
+ } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) {
+ fprintf(stderr,
+ "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64
+ " %" PRIu64 "\n",
+ f1->fd.smallest_seqno, f1->fd.largest_seqno,
+ f2->fd.smallest_seqno, f2->fd.largest_seqno);
+ return Status::Corruption(
+ "L0 files seqno " + NumberToString(f1->fd.smallest_seqno) +
+ " " + NumberToString(f1->fd.largest_seqno) + " " +
+ NumberToString(f1->fd.GetNumber()) + " vs. " +
+ NumberToString(f2->fd.smallest_seqno) + " " +
+ NumberToString(f2->fd.largest_seqno) + " " +
+ NumberToString(f2->fd.GetNumber()));
+ }
+ } else {
+ if (!level_nonzero_cmp_(f1, f2)) {
+ fprintf(stderr, "L%d files are not sorted properly", level);
+ return Status::Corruption("L" + NumberToString(level) +
+ " files are not sorted properly");
+ }
+
+ // Make sure there is no overlap in levels > 0
+ if (vstorage->InternalComparator()->Compare(f1->largest,
+ f2->smallest) >= 0) {
+ fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level,
+ (f1->largest).DebugString(true).c_str(),
+ (f2->smallest).DebugString(true).c_str());
+ return Status::Corruption(
+ "L" + NumberToString(level) + " have overlapping ranges " +
+ (f1->largest).DebugString(true) + " vs. " +
+ (f2->smallest).DebugString(true));
+ }
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ Status CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number,
+ int level) {
+#ifdef NDEBUG
+ if (!base_vstorage_->force_consistency_checks()) {
+    // Don't run consistency checks in release mode except if
+    // explicitly asked to
+ return Status::OK();
+ }
+#endif
+    // A file to be deleted must exist in the previous version.
+ bool found = false;
+ for (int l = 0; !found && l < num_levels_; l++) {
+ const std::vector<FileMetaData*>& base_files =
+ base_vstorage_->LevelFiles(l);
+ for (size_t i = 0; i < base_files.size(); i++) {
+ FileMetaData* f = base_files[i];
+ if (f->fd.GetNumber() == number) {
+ found = true;
+ break;
+ }
+ }
+ }
+    // If the file did not exist in the previous version, then it
+    // may have been moved from a lower level to a higher level in the
+    // current version.
+ for (int l = level + 1; !found && l < num_levels_; l++) {
+ auto& level_added = levels_[l].added_files;
+ auto got = level_added.find(number);
+ if (got != level_added.end()) {
+ found = true;
+ break;
+ }
+ }
+
+    // Maybe this file was added in a previous edit that was already applied.
+ if (!found) {
+ auto& level_added = levels_[level].added_files;
+ auto got = level_added.find(number);
+ if (got != level_added.end()) {
+ found = true;
+ }
+ }
+ if (!found) {
+ fprintf(stderr, "not found %" PRIu64 "\n", number);
+ return Status::Corruption("not found " + NumberToString(number));
+ }
+ return Status::OK();
+ }
+
+ bool CheckConsistencyForNumLevels() {
+ // Make sure there are no files on or beyond num_levels().
+ if (has_invalid_levels_) {
+ return false;
+ }
+ for (auto& level : invalid_levels_) {
+ if (level.second.size() > 0) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Apply all of the edits in *edit to the current state.
+ Status Apply(VersionEdit* edit) {
+ Status s = CheckConsistency(base_vstorage_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Delete files
+ const auto& del = edit->GetDeletedFiles();
+ for (const auto& del_file : del) {
+ const auto level = del_file.first;
+ const auto number = del_file.second;
+ if (level < num_levels_) {
+ levels_[level].deleted_files.insert(number);
+ CheckConsistencyForDeletes(edit, number, level);
+
+        auto existing = levels_[level].added_files.find(number);
+        if (existing != levels_[level].added_files.end()) {
+          UnrefFile(existing->second);
+          levels_[level].added_files.erase(existing);
+ }
+ } else {
+ if (invalid_levels_[level].erase(number) == 0) {
+          // Deleting a non-existent file on an invalid level.
+ has_invalid_levels_ = true;
+ }
+ }
+ }
+
+ // Add new files
+ for (const auto& new_file : edit->GetNewFiles()) {
+ const int level = new_file.first;
+ if (level < num_levels_) {
+ FileMetaData* f = new FileMetaData(new_file.second);
+ f->refs = 1;
+
+ assert(levels_[level].added_files.find(f->fd.GetNumber()) ==
+ levels_[level].added_files.end());
+ levels_[level].deleted_files.erase(f->fd.GetNumber());
+ levels_[level].added_files[f->fd.GetNumber()] = f;
+ } else {
+ uint64_t number = new_file.second.fd.GetNumber();
+ auto& lvls = invalid_levels_[level];
+ if (lvls.count(number) == 0) {
+ lvls.insert(number);
+ } else {
+          // Adding an already existing file on an invalid level.
+ has_invalid_levels_ = true;
+ }
+ }
+ }
+ return s;
+ }
+
+  // Save the current state in *vstorage.
+ Status SaveTo(VersionStorageInfo* vstorage) {
+ Status s = CheckConsistency(base_vstorage_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = CheckConsistency(vstorage);
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (int level = 0; level < num_levels_; level++) {
+ const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
+ // Merge the set of added files with the set of pre-existing files.
+      // Drop any deleted files. Store the result in *vstorage.
+ const auto& base_files = base_vstorage_->LevelFiles(level);
+ const auto& unordered_added_files = levels_[level].added_files;
+ vstorage->Reserve(level,
+ base_files.size() + unordered_added_files.size());
+
+ // Sort added files for the level.
+ std::vector<FileMetaData*> added_files;
+ added_files.reserve(unordered_added_files.size());
+ for (const auto& pair : unordered_added_files) {
+ added_files.push_back(pair.second);
+ }
+ std::sort(added_files.begin(), added_files.end(), cmp);
+
+#ifndef NDEBUG
+ FileMetaData* prev_added_file = nullptr;
+ for (const auto& added : added_files) {
+ if (level > 0 && prev_added_file != nullptr) {
+ assert(base_vstorage_->InternalComparator()->Compare(
+ prev_added_file->smallest, added->smallest) <= 0);
+ }
+ prev_added_file = added;
+ }
+#endif
+
+ auto base_iter = base_files.begin();
+ auto base_end = base_files.end();
+ auto added_iter = added_files.begin();
+ auto added_end = added_files.end();
+ while (added_iter != added_end || base_iter != base_end) {
+ if (base_iter == base_end ||
+ (added_iter != added_end && cmp(*added_iter, *base_iter))) {
+ MaybeAddFile(vstorage, level, *added_iter++);
+ } else {
+ MaybeAddFile(vstorage, level, *base_iter++);
+ }
+ }
+ }
+
+ s = CheckConsistency(vstorage);
+ return s;
+ }
+
+ Status LoadTableHandlers(InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load,
+ const SliceTransform* prefix_extractor) {
+ assert(table_cache_ != nullptr);
+
+ size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity();
+ bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity);
+ size_t max_load = port::kMaxSizet;
+
+ if (!always_load) {
+      // If this is the initial load and we are not set to always load all the
+      // files, we only load up to kInitialLoadLimit files, to limit the time
+      // spent reopening the DB.
+ const size_t kInitialLoadLimit = 16;
+ size_t load_limit;
+ // If the table cache is not 1/4 full, we pin the table handle to
+ // file metadata to avoid the cache read costs when reading the file.
+ // The downside of pinning those files is that LRU won't be followed
+      // for those files. This doesn't matter much because if the number of
+      // files in the DB exceeds the table cache capacity, eventually no table
+      // reader will be pinned and LRU will be followed.
+ if (is_initial_load) {
+ load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
+ } else {
+ load_limit = table_cache_capacity / 4;
+ }
+
+ size_t table_cache_usage = table_cache_->get_cache()->GetUsage();
+ if (table_cache_usage >= load_limit) {
+ // TODO (yanqin) find a suitable status code.
+ return Status::OK();
+ } else {
+ max_load = load_limit - table_cache_usage;
+ }
+ }
+
+ // <file metadata, level>
+ std::vector<std::pair<FileMetaData*, int>> files_meta;
+ std::vector<Status> statuses;
+ for (int level = 0; level < num_levels_; level++) {
+ for (auto& file_meta_pair : levels_[level].added_files) {
+ auto* file_meta = file_meta_pair.second;
+ // If the file has been opened before, just skip it.
+ if (!file_meta->table_reader_handle) {
+ files_meta.emplace_back(file_meta, level);
+ statuses.emplace_back(Status::OK());
+ }
+ if (files_meta.size() >= max_load) {
+ break;
+ }
+ }
+ if (files_meta.size() >= max_load) {
+ break;
+ }
+ }
+
+ std::atomic<size_t> next_file_meta_idx(0);
+ std::function<void()> load_handlers_func([&]() {
+ while (true) {
+ size_t file_idx = next_file_meta_idx.fetch_add(1);
+ if (file_idx >= files_meta.size()) {
+ break;
+ }
+
+ auto* file_meta = files_meta[file_idx].first;
+ int level = files_meta[file_idx].second;
+ statuses[file_idx] = table_cache_->FindTable(
+ file_options_, *(base_vstorage_->InternalComparator()),
+ file_meta->fd, &file_meta->table_reader_handle, prefix_extractor,
+ false /*no_io */, true /* record_read_stats */,
+ internal_stats->GetFileReadHist(level), false, level,
+ prefetch_index_and_filter_in_cache);
+ if (file_meta->table_reader_handle != nullptr) {
+ // Load table_reader
+ file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
+ file_meta->table_reader_handle);
+ }
+ }
+ });
+
+ std::vector<port::Thread> threads;
+ for (int i = 1; i < max_threads; i++) {
+ threads.emplace_back(load_handlers_func);
+ }
+ load_handlers_func();
+ for (auto& t : threads) {
+ t.join();
+ }
+ for (const auto& s : statuses) {
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+ }
+
+ void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) {
+ if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
+      // f is a to-be-deleted table file
+ vstorage->RemoveCurrentStats(f);
+ } else {
+ vstorage->AddFile(level, f, info_log_);
+ }
+ }
+};
+
+VersionBuilder::VersionBuilder(const FileOptions& file_options,
+ TableCache* table_cache,
+ VersionStorageInfo* base_vstorage,
+ Logger* info_log)
+ : rep_(new Rep(file_options, info_log, table_cache, base_vstorage)) {}
+
+VersionBuilder::~VersionBuilder() { delete rep_; }
+
+Status VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
+ return rep_->CheckConsistency(vstorage);
+}
+
+Status VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit,
+ uint64_t number, int level) {
+ return rep_->CheckConsistencyForDeletes(edit, number, level);
+}
+
+bool VersionBuilder::CheckConsistencyForNumLevels() {
+ return rep_->CheckConsistencyForNumLevels();
+}
+
+Status VersionBuilder::Apply(VersionEdit* edit) { return rep_->Apply(edit); }
+
+Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) {
+ return rep_->SaveTo(vstorage);
+}
+
+Status VersionBuilder::LoadTableHandlers(
+ InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+ const SliceTransform* prefix_extractor) {
+ return rep_->LoadTableHandlers(internal_stats, max_threads,
+ prefetch_index_and_filter_in_cache,
+ is_initial_load, prefix_extractor);
+}
+
+void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level,
+ FileMetaData* f) {
+ rep_->MaybeAddFile(vstorage, level, f);
+}
+
+} // namespace ROCKSDB_NAMESPACE
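The heart of SaveTo() above is the two-pointer merge of the sorted base files with the sorted added files, with deleted files filtered out by MaybeAddFile(). A simplified, self-contained model of that loop, using plain ints in place of FileMetaData* purely for illustration:

    #include <set>
    #include <vector>

    // base and added are each sorted under the same ordering; anything listed
    // in deleted is dropped, mirroring MaybeAddFile().
    std::vector<int> MergeLevel(const std::vector<int>& base,
                                const std::vector<int>& added,
                                const std::set<int>& deleted) {
      std::vector<int> out;
      auto base_iter = base.begin();
      auto added_iter = added.begin();
      while (added_iter != added.end() || base_iter != base.end()) {
        int next;
        if (base_iter == base.end() ||
            (added_iter != added.end() && *added_iter < *base_iter)) {
          next = *added_iter++;  // take the smaller added file
        } else {
          next = *base_iter++;   // take the smaller pre-existing file
        }
        if (deleted.count(next) == 0) {
          out.push_back(next);
        }
      }
      return out;
    }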
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
new file mode 100644
index 000000000..87415ed55
--- /dev/null
+++ b/src/rocksdb/db/version_builder.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+class InternalStats;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionBuilder {
+ public:
+ VersionBuilder(const FileOptions& file_options, TableCache* table_cache,
+ VersionStorageInfo* base_vstorage, Logger* info_log = nullptr);
+ ~VersionBuilder();
+ Status CheckConsistency(VersionStorageInfo* vstorage);
+ Status CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
+ int level);
+ bool CheckConsistencyForNumLevels();
+ Status Apply(VersionEdit* edit);
+ Status SaveTo(VersionStorageInfo* vstorage);
+ Status LoadTableHandlers(InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load,
+ const SliceTransform* prefix_extractor);
+ void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f);
+
+ private:
+ class Rep;
+ Rep* rep_;
+};
+
+extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b);
+} // namespace ROCKSDB_NAMESPACE
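The class is typically driven as in the tests that follow: build against the current VersionStorageInfo, Apply() one or more edits, then SaveTo() a fresh VersionStorageInfo. A condensed sketch, assuming the ROCKSDB_NAMESPACE types are in scope and with error handling trimmed (the table cache is left null, as in version_builder_test.cc below):

    Status BuildNewVersion(const FileOptions& file_options,
                           VersionStorageInfo* base_vstorage, VersionEdit* edit,
                           VersionStorageInfo* new_vstorage) {
      VersionBuilder builder(file_options, /*table_cache=*/nullptr,
                             base_vstorage);
      Status s = builder.Apply(edit);      // accumulate added/deleted files
      if (s.ok()) {
        s = builder.SaveTo(new_vstorage);  // merge base state + edits
      }
      return s;
    }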
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
new file mode 100644
index 000000000..2dda03f31
--- /dev/null
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -0,0 +1,349 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilderTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ VersionStorageInfo vstorage_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::vector<uint64_t> size_being_compacted_;
+
+ VersionBuilderTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+ nullptr, false),
+ file_num_(1) {
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ size_being_compacted_.resize(options_.num_levels);
+ }
+
+ ~VersionBuilderTest() override {
+ for (int i = 0; i < vstorage_.num_levels(); i++) {
+ for (auto* f : vstorage_.LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ InternalKey GetInternalKey(const char* ukey,
+ SequenceNumber smallest_seq = 100) {
+ return InternalKey(ukey, smallest_seq, kTypeValue);
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ uint64_t num_entries = 0, uint64_t num_deletions = 0,
+ bool sampled = false, SequenceNumber smallest_seqno = 0,
+ SequenceNumber largest_seqno = 0) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
+ GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
+ /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ f->compensated_file_size = file_size;
+ f->num_entries = num_entries;
+ f->num_deletions = num_deletions;
+ vstorage_.AddFile(level, f);
+ if (sampled) {
+ f->init_stats_from_file = true;
+ vstorage_.UpdateAccumulatedStats(f);
+ }
+ }
+
+ void UpdateVersionStorageInfo() {
+ vstorage_.UpdateFilesByCompactionPri(ioptions_.compaction_pri);
+ vstorage_.UpdateNumNonEmptyLevels();
+ vstorage_.GenerateFileIndexer();
+ vstorage_.GenerateLevelFilesBrief();
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ vstorage_.GenerateLevel0NonOverlapping();
+ vstorage_.SetFinalized();
+ }
+};
+
+void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) {
+ for (int i = 0; i < new_vstorage->num_levels(); i++) {
+ for (auto* f : new_vstorage->LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
+ Add(0, 1U, "150", "200", 100U);
+
+ Add(1, 66U, "150", "200", 100U);
+ Add(1, 88U, "201", "300", 100U);
+
+ Add(2, 6U, "150", "179", 100U);
+ Add(2, 7U, "180", "220", 100U);
+ Add(2, 8U, "221", "300", 100U);
+
+ Add(3, 26U, "150", "170", 100U);
+ Add(3, 27U, "171", "179", 100U);
+ Add(3, 28U, "191", "220", 100U);
+ Add(3, 29U, "221", "300", 100U);
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.DeleteFile(3, 27U);
+
+ EnvOptions env_options;
+
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ version_builder.Apply(&version_edit);
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+ Add(4, 6U, "150", "179", 100U);
+ Add(4, 7U, "180", "220", 100U);
+ Add(4, 8U, "221", "300", 100U);
+
+ Add(5, 26U, "150", "170", 100U);
+ Add(5, 27U, "171", "179", 100U);
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.DeleteFile(0, 1U);
+ version_edit.DeleteFile(0, 88U);
+
+ EnvOptions env_options;
+
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ version_builder.Apply(&version_edit);
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+ ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3));
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(4));
+ ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+ Add(4, 6U, "150", "179", 100U);
+ Add(4, 7U, "180", "220", 100U);
+ Add(4, 8U, "221", "300", 100U);
+
+ Add(5, 26U, "150", "170", 100U);
+ Add(5, 27U, "171", "179", 100U);
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.DeleteFile(0, 1U);
+ version_edit.DeleteFile(0, 88U);
+ version_edit.DeleteFile(4, 6U);
+ version_edit.DeleteFile(4, 7U);
+ version_edit.DeleteFile(4, 8U);
+
+ EnvOptions env_options;
+
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ version_builder.Apply(&version_edit);
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+ ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4));
+ ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+ GetInternalKey("450"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+ GetInternalKey("650"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+ GetInternalKey("550"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+ GetInternalKey("750"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+
+ EnvOptions env_options;
+
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ version_builder.Apply(&version_edit);
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+
+ VersionEdit version_edit;
+ version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+ GetInternalKey("350"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+ GetInternalKey("450"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+ GetInternalKey("650"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+ GetInternalKey("550"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+ GetInternalKey("750"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_builder.Apply(&version_edit);
+
+ VersionEdit version_edit2;
+ version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
+ GetInternalKey("950"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_edit2.DeleteFile(2, 616);
+ version_edit2.DeleteFile(2, 636);
+ version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
+ GetInternalKey("850"), 200, 200, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ version_builder.Apply(&version_edit2);
+
+ version_builder.SaveTo(&new_vstorage);
+
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, EstimatedActiveKeys) {
+ const uint32_t kTotalSamples = 20;
+ const uint32_t kNumLevels = 5;
+ const uint32_t kFilesPerLevel = 8;
+ const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
+ const uint32_t kEntriesPerFile = 1000;
+ const uint32_t kDeletionsPerFile = 100;
+ for (uint32_t i = 0; i < kNumFiles; ++i) {
+ Add(static_cast<int>(i / kFilesPerLevel), i + 1,
+ ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(),
+ 100U, 0, 100, 100,
+ kEntriesPerFile, kDeletionsPerFile,
+ (i < kTotalSamples));
+ }
+  // Subtract 2x the number of deletion entries because:
+  // 1x: a deletion entry does not count as a data entry.
+  // 1x: each deletion entry will actually remove one data entry.
+  // With kEntriesPerFile = 1000, kDeletionsPerFile = 100 and kNumFiles = 40,
+  // the expected estimate is (1000 - 2 * 100) * 40 = 32000.
+ ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
+ (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc
new file mode 100644
index 000000000..e45e82656
--- /dev/null
+++ b/src/rocksdb/db/version_edit.cc
@@ -0,0 +1,826 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob_index.h"
+#include "db/version_set.h"
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The unknown file checksum.
+const std::string kUnknownFileChecksum("");
+// The unknown sst file checksum function name.
+const std::string kUnknownFileChecksumFuncName("Unknown");
+// Mask for an identified tag from the future which can be safely ignored.
+const uint32_t kTagSafeIgnoreMask = 1 << 13;
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed. The number should be forward compatible so
+// users can downgrade RocksDB safely. A future tag can be recognized as
+// safely ignorable by AND-ing it with kTagSafeIgnoreMask.
+enum Tag : uint32_t {
+ kComparator = 1,
+ kLogNumber = 2,
+ kNextFileNumber = 3,
+ kLastSequence = 4,
+ kCompactPointer = 5,
+ kDeletedFile = 6,
+ kNewFile = 7,
+ // 8 was used for large value refs
+ kPrevLogNumber = 9,
+ kMinLogNumberToKeep = 10,
+ // Ignore-able field
+ kDbId = kTagSafeIgnoreMask + 1,
+
+ // these are new formats divergent from open source leveldb
+ kNewFile2 = 100,
+ kNewFile3 = 102,
+ kNewFile4 = 103, // 4th (the latest) format version of adding files
+ kColumnFamily = 200, // specify column family for version edit
+ kColumnFamilyAdd = 201,
+ kColumnFamilyDrop = 202,
+ kMaxColumnFamily = 203,
+
+ kInAtomicGroup = 300,
+};
+
+enum CustomTag : uint32_t {
+ kTerminate = 1, // The end of customized fields
+ kNeedCompaction = 2,
+  // Since the manifest is currently not entirely forward-compatible, and the
+  // only forward-compatible part is the CustomTag of kNewFile, we currently
+  // encode kMinLogNumberToKeep as part of a CustomTag as a hack. This should
+  // be removed when the manifest becomes forward-compatible.
+ kMinLogNumberToKeepHack = 3,
+ kOldestBlobFileNumber = 4,
+ kOldestAncesterTime = 5,
+ kFileCreationTime = 6,
+ kFileChecksum = 7,
+ kFileChecksumFuncName = 8,
+ kPathId = 65,
+};
+// If this bit for the custom tag is set, opening DB should fail if
+// we don't know this field.
+uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6;
+
+uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
+ assert(number <= kFileNumberMask);
+ return number | (path_id * (kFileNumberMask + 1));
+}
+
+void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
+ SequenceNumber seqno,
+ ValueType value_type) {
+ if (smallest.size() == 0) {
+ smallest.DecodeFrom(key);
+ }
+ largest.DecodeFrom(key);
+ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+ fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+
+#ifndef ROCKSDB_LITE
+ if (value_type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ return;
+ }
+
+ if (blob_index.IsInlined()) {
+ return;
+ }
+
+ if (blob_index.HasTTL()) {
+ return;
+ }
+
+ // Paranoid check: this should not happen because BlobDB numbers the blob
+ // files starting from 1.
+ if (blob_index.file_number() == kInvalidBlobFileNumber) {
+ return;
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+#else
+ (void)value;
+ (void)value_type;
+#endif
+}
+
+void VersionEdit::Clear() {
+ max_level_ = 0;
+ db_id_.clear();
+ comparator_.clear();
+ log_number_ = 0;
+ prev_log_number_ = 0;
+ next_file_number_ = 0;
+ max_column_family_ = 0;
+ min_log_number_to_keep_ = 0;
+ last_sequence_ = 0;
+ has_db_id_ = false;
+ has_comparator_ = false;
+ has_log_number_ = false;
+ has_prev_log_number_ = false;
+ has_next_file_number_ = false;
+ has_max_column_family_ = false;
+ has_min_log_number_to_keep_ = false;
+ has_last_sequence_ = false;
+ deleted_files_.clear();
+ new_files_.clear();
+ column_family_ = 0;
+ is_column_family_add_ = false;
+ is_column_family_drop_ = false;
+ column_family_name_.clear();
+ is_in_atomic_group_ = false;
+ remaining_entries_ = 0;
+}
+
+bool VersionEdit::EncodeTo(std::string* dst) const {
+ if (has_db_id_) {
+ PutVarint32(dst, kDbId);
+ PutLengthPrefixedSlice(dst, db_id_);
+ }
+ if (has_comparator_) {
+ PutVarint32(dst, kComparator);
+ PutLengthPrefixedSlice(dst, comparator_);
+ }
+ if (has_log_number_) {
+ PutVarint32Varint64(dst, kLogNumber, log_number_);
+ }
+ if (has_prev_log_number_) {
+ PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
+ }
+ if (has_max_column_family_) {
+ PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
+ }
+ if (has_last_sequence_) {
+ PutVarint32Varint64(dst, kLastSequence, last_sequence_);
+ }
+ for (const auto& deleted : deleted_files_) {
+ PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
+ deleted.second /* file number */);
+ }
+
+ bool min_log_num_written = false;
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ if (!f.smallest.Valid() || !f.largest.Valid()) {
+ return false;
+ }
+ PutVarint32(dst, kNewFile4);
+ PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
+ PutVarint64(dst, f.fd.GetFileSize());
+ PutLengthPrefixedSlice(dst, f.smallest.Encode());
+ PutLengthPrefixedSlice(dst, f.largest.Encode());
+ PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+ // Customized fields' format:
+ // +-----------------------------+
+ // | 1st field's tag (varint32) |
+ // +-----------------------------+
+ // | 1st field's size (varint32) |
+ // +-----------------------------+
+ // | bytes for 1st field |
+ // | (based on size decoded) |
+ // +-----------------------------+
+ // | |
+ // | ...... |
+ // | |
+    // +-----------------------------+
+    // | last field's tag (varint32) |
+    // +-----------------------------+
+    // | last field's size (varint32)|
+ // +-----------------------------+
+ // | bytes for last field |
+ // | (based on size decoded) |
+ // +-----------------------------+
+ // | terminating tag (varint32) |
+ // +-----------------------------+
+ //
+ // Customized encoding for fields:
+ // tag kPathId: 1 byte as path_id
+ // tag kNeedCompaction:
+ // now only can take one char value 1 indicating need-compaction
+ //
+ PutVarint32(dst, CustomTag::kOldestAncesterTime);
+ std::string varint_oldest_ancester_time;
+ PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+ &varint_oldest_ancester_time);
+ PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+ PutVarint32(dst, CustomTag::kFileCreationTime);
+ std::string varint_file_creation_time;
+ PutVarint64(&varint_file_creation_time, f.file_creation_time);
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+ &varint_file_creation_time);
+ PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+ PutVarint32(dst, CustomTag::kFileChecksum);
+ PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+ PutVarint32(dst, CustomTag::kFileChecksumFuncName);
+ PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+
+ if (f.fd.GetPathId() != 0) {
+ PutVarint32(dst, CustomTag::kPathId);
+ char p = static_cast<char>(f.fd.GetPathId());
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
+ if (f.marked_for_compaction) {
+ PutVarint32(dst, CustomTag::kNeedCompaction);
+ char p = static_cast<char>(1);
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
+ if (has_min_log_number_to_keep_ && !min_log_num_written) {
+ PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack);
+ std::string varint_log_number;
+ PutFixed64(&varint_log_number, min_log_number_to_keep_);
+ PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+ min_log_num_written = true;
+ }
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ PutVarint32(dst, CustomTag::kOldestBlobFileNumber);
+ std::string oldest_blob_file_number;
+ PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+ PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
+ }
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
+ dst);
+
+ PutVarint32(dst, CustomTag::kTerminate);
+ }
+
+ // 0 is default and does not need to be explicitly written
+ if (column_family_ != 0) {
+ PutVarint32Varint32(dst, kColumnFamily, column_family_);
+ }
+
+ if (is_column_family_add_) {
+ PutVarint32(dst, kColumnFamilyAdd);
+ PutLengthPrefixedSlice(dst, Slice(column_family_name_));
+ }
+
+ if (is_column_family_drop_) {
+ PutVarint32(dst, kColumnFamilyDrop);
+ }
+
+ if (is_in_atomic_group_) {
+ PutVarint32(dst, kInAtomicGroup);
+ PutVarint32(dst, remaining_entries_);
+ }
+ return true;
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+ Slice str;
+ if (GetLengthPrefixedSlice(input, &str)) {
+ dst->DecodeFrom(str);
+ return dst->Valid();
+ } else {
+ return false;
+ }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
+ uint32_t v = 0;
+ if (GetVarint32(input, &v)) {
+ *level = v;
+ if (max_level_ < *level) {
+ max_level_ = *level;
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+static bool is_pseudo_new_file_record_pr3488(
+ const int level,
+ const uint64_t number,
+ const uint64_t file_size,
+ InternalKey& smallest,
+ InternalKey& largest,
+ const bool has_min_log_number_to_keep_) {
+
+ if (level == 0 && number == 0 && file_size == 0 &&
+ has_min_log_number_to_keep_) {
+ InternalKey dummy_key(Slice("dummy_key"), 0ull, ValueType::kTypeValue);
+ return (*smallest.rep() == *dummy_key.rep() &&
+ *largest.rep() == *dummy_key.rep());
+ } else {
+ return false;
+ }
+}
+
+const char* VersionEdit::DecodeNewFile4From(Slice* input) {
+ const char* msg = nullptr;
+ int level = 0;
+ FileMetaData f;
+ uint64_t number = 0;
+ uint32_t path_id = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+  // Since this is the only forward-compatible part of the code, we hack new
+  // extensions into this record. When we do, we set this boolean to distinguish
+ // the record from the normal NewFile records.
+ if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
+ GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
+ GetInternalKey(input, &f.largest) &&
+ GetVarint64(input, &smallest_seqno) &&
+ GetVarint64(input, &largest_seqno)) {
+ // See comments in VersionEdit::EncodeTo() for format of customized fields
+ while (true) {
+ uint32_t custom_tag = 0;
+ Slice field;
+ if (!GetVarint32(input, &custom_tag)) {
+ return "new-file4 custom field";
+ }
+ if (custom_tag == kTerminate) {
+ break;
+ }
+ if (!GetLengthPrefixedSlice(input, &field)) {
+ return "new-file4 custom field length prefixed slice error";
+ }
+ switch (custom_tag) {
+ case kPathId:
+ if (field.size() != 1) {
+ return "path_id field wrong size";
+ }
+ path_id = field[0];
+ if (path_id > 3) {
+ return "path_id wrong vaue";
+ }
+ break;
+ case kOldestAncesterTime:
+ if (!GetVarint64(&field, &f.oldest_ancester_time)) {
+ return "invalid oldest ancester time";
+ }
+ break;
+ case kFileCreationTime:
+ if (!GetVarint64(&field, &f.file_creation_time)) {
+ return "invalid file creation time";
+ }
+ break;
+ case kFileChecksum:
+ f.file_checksum = field.ToString();
+ break;
+ case kFileChecksumFuncName:
+ f.file_checksum_func_name = field.ToString();
+ break;
+ case kNeedCompaction:
+ if (field.size() != 1) {
+ return "need_compaction field wrong size";
+ }
+ f.marked_for_compaction = (field[0] == 1);
+ break;
+ case kMinLogNumberToKeepHack:
+ // This is a hack to encode kMinLogNumberToKeep in a
+ // forward-compatible fashion.
+ if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+ return "deleted log number malformatted";
+ }
+ has_min_log_number_to_keep_ = true;
+ break;
+ case kOldestBlobFileNumber:
+ if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
+ return "invalid oldest blob file number";
+ }
+ break;
+ default:
+ if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
+ // Should not proceed if cannot understand it
+ return "new-file4 custom field not supported";
+ }
+ break;
+ }
+ }
+ } else {
+ return "new-file4 entry";
+ }
+ if (is_pseudo_new_file_record_pr3488(level, number, file_size,
+ f.smallest, f.largest,
+ has_min_log_number_to_keep_)) {
+ // Since this has nothing to do with NewFile, return immediately.
+ return nullptr;
+ }
+ f.fd =
+ FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ return nullptr;
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+ Clear();
+ Slice input = src;
+ const char* msg = nullptr;
+ uint32_t tag = 0;
+
+ // Temporary storage for parsing
+ int level = 0;
+ FileMetaData f;
+ Slice str;
+ InternalKey key;
+ while (msg == nullptr && GetVarint32(&input, &tag)) {
+ switch (tag) {
+ case kDbId:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ db_id_ = str.ToString();
+ has_db_id_ = true;
+ } else {
+ msg = "db id";
+ }
+ break;
+ case kComparator:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ comparator_ = str.ToString();
+ has_comparator_ = true;
+ } else {
+ msg = "comparator name";
+ }
+ break;
+
+ case kLogNumber:
+ if (GetVarint64(&input, &log_number_)) {
+ has_log_number_ = true;
+ } else {
+ msg = "log number";
+ }
+ break;
+
+ case kPrevLogNumber:
+ if (GetVarint64(&input, &prev_log_number_)) {
+ has_prev_log_number_ = true;
+ } else {
+ msg = "previous log number";
+ }
+ break;
+
+ case kNextFileNumber:
+ if (GetVarint64(&input, &next_file_number_)) {
+ has_next_file_number_ = true;
+ } else {
+ msg = "next file number";
+ }
+ break;
+
+ case kMaxColumnFamily:
+ if (GetVarint32(&input, &max_column_family_)) {
+ has_max_column_family_ = true;
+ } else {
+ msg = "max column family";
+ }
+ break;
+
+ case kMinLogNumberToKeep:
+ if (GetVarint64(&input, &min_log_number_to_keep_)) {
+ has_min_log_number_to_keep_ = true;
+ } else {
+ msg = "min log number to kee";
+ }
+ break;
+
+ case kLastSequence:
+ if (GetVarint64(&input, &last_sequence_)) {
+ has_last_sequence_ = true;
+ } else {
+ msg = "last sequence number";
+ }
+ break;
+
+ case kCompactPointer:
+ if (GetLevel(&input, &level, &msg) &&
+ GetInternalKey(&input, &key)) {
+          // We don't use compact pointers anymore,
+          // but we should not fail if they are still
+          // in the manifest.
+ } else {
+ if (!msg) {
+ msg = "compaction pointer";
+ }
+ }
+ break;
+
+ case kDeletedFile: {
+ uint64_t number = 0;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+ deleted_files_.insert(std::make_pair(level, number));
+ } else {
+ if (!msg) {
+ msg = "deleted file";
+ }
+ }
+ break;
+ }
+
+ case kNewFile: {
+ uint64_t number = 0;
+ uint64_t file_size = 0;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest)) {
+ f.fd = FileDescriptor(number, 0, file_size);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file entry";
+ }
+ }
+ break;
+ }
+ case kNewFile2: {
+ uint64_t number = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest) &&
+ GetVarint64(&input, &smallest_seqno) &&
+ GetVarint64(&input, &largest_seqno)) {
+ f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
+ largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file2 entry";
+ }
+ }
+ break;
+ }
+
+ case kNewFile3: {
+ uint64_t number = 0;
+ uint32_t path_id = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest) &&
+ GetVarint64(&input, &smallest_seqno) &&
+ GetVarint64(&input, &largest_seqno)) {
+ f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
+ largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file3 entry";
+ }
+ }
+ break;
+ }
+
+ case kNewFile4: {
+ msg = DecodeNewFile4From(&input);
+ break;
+ }
+
+ case kColumnFamily:
+ if (!GetVarint32(&input, &column_family_)) {
+ if (!msg) {
+ msg = "set column family id";
+ }
+ }
+ break;
+
+ case kColumnFamilyAdd:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ is_column_family_add_ = true;
+ column_family_name_ = str.ToString();
+ } else {
+ if (!msg) {
+ msg = "column family add";
+ }
+ }
+ break;
+
+ case kColumnFamilyDrop:
+ is_column_family_drop_ = true;
+ break;
+
+ case kInAtomicGroup:
+ is_in_atomic_group_ = true;
+ if (!GetVarint32(&input, &remaining_entries_)) {
+ if (!msg) {
+ msg = "remaining entries";
+ }
+ }
+ break;
+
+ default:
+ if (tag & kTagSafeIgnoreMask) {
+          // Tag from the future which can be safely ignored.
+ // The next field must be the length of the entry.
+ uint32_t field_len;
+ if (!GetVarint32(&input, &field_len) ||
+ static_cast<size_t>(field_len) > input.size()) {
+ if (!msg) {
+ msg = "safely ignoreable tag length error";
+ }
+ } else {
+ input.remove_prefix(static_cast<size_t>(field_len));
+ }
+ } else {
+ msg = "unknown tag";
+ }
+ break;
+ }
+ }
+
+ if (msg == nullptr && !input.empty()) {
+ msg = "invalid tag";
+ }
+
+ Status result;
+ if (msg != nullptr) {
+ result = Status::Corruption("VersionEdit", msg);
+ }
+ return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+ std::string r;
+ r.append("VersionEdit {");
+ if (has_db_id_) {
+ r.append("\n DB ID: ");
+ r.append(db_id_);
+ }
+ if (has_comparator_) {
+ r.append("\n Comparator: ");
+ r.append(comparator_);
+ }
+ if (has_log_number_) {
+ r.append("\n LogNumber: ");
+ AppendNumberTo(&r, log_number_);
+ }
+ if (has_prev_log_number_) {
+ r.append("\n PrevLogNumber: ");
+ AppendNumberTo(&r, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ r.append("\n NextFileNumber: ");
+ AppendNumberTo(&r, next_file_number_);
+ }
+ if (has_max_column_family_) {
+ r.append("\n MaxColumnFamily: ");
+ AppendNumberTo(&r, max_column_family_);
+ }
+ if (has_min_log_number_to_keep_) {
+ r.append("\n MinLogNumberToKeep: ");
+ AppendNumberTo(&r, min_log_number_to_keep_);
+ }
+ if (has_last_sequence_) {
+ r.append("\n LastSeq: ");
+ AppendNumberTo(&r, last_sequence_);
+ }
+ for (const auto& deleted_file : deleted_files_) {
+ r.append("\n DeleteFile: ");
+ AppendNumberTo(&r, deleted_file.first);
+ r.append(" ");
+ AppendNumberTo(&r, deleted_file.second);
+ }
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ r.append("\n AddFile: ");
+ AppendNumberTo(&r, new_files_[i].first);
+ r.append(" ");
+ AppendNumberTo(&r, f.fd.GetNumber());
+ r.append(" ");
+ AppendNumberTo(&r, f.fd.GetFileSize());
+ r.append(" ");
+ r.append(f.smallest.DebugString(hex_key));
+ r.append(" .. ");
+ r.append(f.largest.DebugString(hex_key));
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ r.append(" blob_file:");
+ AppendNumberTo(&r, f.oldest_blob_file_number);
+ }
+ r.append(" oldest_ancester_time:");
+ AppendNumberTo(&r, f.oldest_ancester_time);
+ r.append(" file_creation_time:");
+ AppendNumberTo(&r, f.file_creation_time);
+ r.append(" file_checksum:");
+ r.append(f.file_checksum);
+ r.append(" file_checksum_func_name: ");
+ r.append(f.file_checksum_func_name);
+ }
+ r.append("\n ColumnFamily: ");
+ AppendNumberTo(&r, column_family_);
+ if (is_column_family_add_) {
+ r.append("\n ColumnFamilyAdd: ");
+ r.append(column_family_name_);
+ }
+ if (is_column_family_drop_) {
+ r.append("\n ColumnFamilyDrop");
+ }
+ if (is_in_atomic_group_) {
+ r.append("\n AtomicGroup: ");
+ AppendNumberTo(&r, remaining_entries_);
+ r.append(" entries remains");
+ }
+ r.append("\n}\n");
+ return r;
+}
+
+std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
+ JSONWriter jw;
+ jw << "EditNumber" << edit_num;
+
+ if (has_db_id_) {
+ jw << "DB ID" << db_id_;
+ }
+ if (has_comparator_) {
+ jw << "Comparator" << comparator_;
+ }
+ if (has_log_number_) {
+ jw << "LogNumber" << log_number_;
+ }
+ if (has_prev_log_number_) {
+ jw << "PrevLogNumber" << prev_log_number_;
+ }
+ if (has_next_file_number_) {
+ jw << "NextFileNumber" << next_file_number_;
+ }
+ if (has_max_column_family_) {
+ jw << "MaxColumnFamily" << max_column_family_;
+ }
+ if (has_min_log_number_to_keep_) {
+ jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
+ }
+ if (has_last_sequence_) {
+ jw << "LastSeq" << last_sequence_;
+ }
+
+ if (!deleted_files_.empty()) {
+ jw << "DeletedFiles";
+ jw.StartArray();
+
+ for (const auto& deleted_file : deleted_files_) {
+ jw.StartArrayedObject();
+ jw << "Level" << deleted_file.first;
+ jw << "FileNumber" << deleted_file.second;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!new_files_.empty()) {
+ jw << "AddedFiles";
+ jw.StartArray();
+
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ jw.StartArrayedObject();
+ jw << "Level" << new_files_[i].first;
+ const FileMetaData& f = new_files_[i].second;
+ jw << "FileNumber" << f.fd.GetNumber();
+ jw << "FileSize" << f.fd.GetFileSize();
+ jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
+ jw << "LargestIKey" << f.largest.DebugString(hex_key);
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ jw << "OldestBlobFile" << f.oldest_blob_file_number;
+ }
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ jw << "ColumnFamily" << column_family_;
+
+ if (is_column_family_add_) {
+ jw << "ColumnFamilyAdd" << column_family_name_;
+ }
+ if (is_column_family_drop_) {
+ jw << "ColumnFamilyDrop" << column_family_name_;
+ }
+ if (is_in_atomic_group_) {
+ jw << "AtomicGroup" << remaining_entries_;
+ }
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h
new file mode 100644
index 000000000..6d1893f2a
--- /dev/null
+++ b/src/rocksdb/db/version_edit.h
@@ -0,0 +1,438 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "rocksdb/cache.h"
+#include "table/table_reader.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionSet;
+
+constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
+constexpr uint64_t kInvalidBlobFileNumber = 0;
+constexpr uint64_t kUnknownOldestAncesterTime = 0;
+constexpr uint64_t kUnknownFileCreationTime = 0;
+
+extern const std::string kUnknownFileChecksum;
+extern const std::string kUnknownFileChecksumFuncName;
+
+extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
+
+// A copyable structure contains information needed to read data from an SST
+// file. It can contain a pointer to a table reader opened for the file, or
+// file number and size, which can be used to create a new table reader for it.
+// The behavior is undefined when a copy of the structure is used after the
+// file is no longer part of any live version.
+struct FileDescriptor {
+ // Table reader in table_reader_handle
+ TableReader* table_reader;
+ uint64_t packed_number_and_path_id;
+ uint64_t file_size; // File size in bytes
+ SequenceNumber smallest_seqno; // The smallest seqno in this file
+ SequenceNumber largest_seqno; // The largest seqno in this file
+
+ FileDescriptor() : FileDescriptor(0, 0, 0) {}
+
+ FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
+ : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
+
+ FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
+ SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
+ : table_reader(nullptr),
+ packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
+ file_size(_file_size),
+ smallest_seqno(_smallest_seqno),
+ largest_seqno(_largest_seqno) {}
+
+ FileDescriptor(const FileDescriptor& fd) { *this = fd; }
+
+ FileDescriptor& operator=(const FileDescriptor& fd) {
+ table_reader = fd.table_reader;
+ packed_number_and_path_id = fd.packed_number_and_path_id;
+ file_size = fd.file_size;
+ smallest_seqno = fd.smallest_seqno;
+ largest_seqno = fd.largest_seqno;
+ return *this;
+ }
+
+ uint64_t GetNumber() const {
+ return packed_number_and_path_id & kFileNumberMask;
+ }
+ uint32_t GetPathId() const {
+ return static_cast<uint32_t>(
+ packed_number_and_path_id / (kFileNumberMask + 1));
+ }
+ uint64_t GetFileSize() const { return file_size; }
+};
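
For intuition, here is a minimal standalone sketch (not part of the patch; the Pack() helper below is a hypothetical stand-in for the PackFileNumberAndPathId() declared above) of a packing scheme that is consistent with the GetNumber() and GetPathId() accessors:

#include <cassert>
#include <cstdint>

// Mirrors kFileNumberMask above: the low 62 bits hold the file number and the
// remaining high bits hold the path ID.
constexpr uint64_t kMask = 0x3FFFFFFFFFFFFFFF;

// Hypothetical packing helper, written so that `packed & kMask` recovers the
// file number and `packed / (kMask + 1)` recovers the path ID, matching the
// decoding done by GetNumber() and GetPathId().
uint64_t Pack(uint64_t number, uint32_t path_id) {
  assert(number <= kMask);
  return number | (static_cast<uint64_t>(path_id) * (kMask + 1));
}

int main() {
  uint64_t packed = Pack(/*number=*/12345, /*path_id=*/3);
  assert((packed & kMask) == 12345);   // what GetNumber() computes
  assert(packed / (kMask + 1) == 3);   // what GetPathId() computes
  return 0;
}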
+
+struct FileSampledStats {
+ FileSampledStats() : num_reads_sampled(0) {}
+ FileSampledStats(const FileSampledStats& other) { *this = other; }
+ FileSampledStats& operator=(const FileSampledStats& other) {
+ num_reads_sampled = other.num_reads_sampled.load();
+ return *this;
+ }
+
+ // number of user reads to this file.
+ mutable std::atomic<uint64_t> num_reads_sampled;
+};
+
+struct FileMetaData {
+ FileDescriptor fd;
+ InternalKey smallest; // Smallest internal key served by table
+ InternalKey largest; // Largest internal key served by table
+
+ // Needs to be disposed when refs becomes 0.
+ Cache::Handle* table_reader_handle = nullptr;
+
+ FileSampledStats stats;
+
+ // Stats for compensating deletion entries during compaction
+
+ // File size compensated by deletion entry.
+  // This is updated in Version::UpdateAccumulatedStats() the first time the
+ // file is created or loaded. After it is updated (!= 0), it is immutable.
+ uint64_t compensated_file_size = 0;
+ // These values can mutate, but they can only be read or written from
+ // single-threaded LogAndApply thread
+ uint64_t num_entries = 0; // the number of entries.
+ uint64_t num_deletions = 0; // the number of deletion entries.
+ uint64_t raw_key_size = 0; // total uncompressed key size.
+ uint64_t raw_value_size = 0; // total uncompressed value size.
+
+ int refs = 0; // Reference count
+
+ bool being_compacted = false; // Is this file undergoing compaction?
+  bool init_stats_from_file = false;    // true if the data-entry stats of this
+                                         // file have been initialized from it.
+
+ bool marked_for_compaction = false; // True if client asked us nicely to
+ // compact this file.
+
+ // Used only in BlobDB. The file number of the oldest blob file this SST file
+ // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+
+  // The file could be the compaction output of other SST files, which could
+  // in turn be outputs of compactions of still older SST files. We track the
+  // memtable flush timestamp of the oldest SST file that eventually
+  // contributed data to this file. 0 means the information is not available.
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+
+ // Unix time when the SST file is created.
+ uint64_t file_creation_time = kUnknownFileCreationTime;
+
+ // File checksum
+ std::string file_checksum = kUnknownFileChecksum;
+
+ // File checksum function name
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+ FileMetaData() = default;
+
+ FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
+ const InternalKey& smallest_key, const InternalKey& largest_key,
+ const SequenceNumber& smallest_seq,
+ const SequenceNumber& largest_seq, bool marked_for_compact,
+ uint64_t oldest_blob_file, uint64_t _oldest_ancester_time,
+ uint64_t _file_creation_time, const std::string& _file_checksum,
+ const std::string& _file_checksum_func_name)
+ : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
+ smallest(smallest_key),
+ largest(largest_key),
+ marked_for_compaction(marked_for_compact),
+ oldest_blob_file_number(oldest_blob_file),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time),
+ file_checksum(_file_checksum),
+ file_checksum_func_name(_file_checksum_func_name) {
+ TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
+ }
+
+ // REQUIRED: Keys must be given to the function in sorted order (it expects
+ // the last key to be the largest).
+ void UpdateBoundaries(const Slice& key, const Slice& value,
+ SequenceNumber seqno, ValueType value_type);
+
+ // Unlike UpdateBoundaries, ranges do not need to be presented in any
+ // particular order.
+ void UpdateBoundariesForRange(const InternalKey& start,
+ const InternalKey& end, SequenceNumber seqno,
+ const InternalKeyComparator& icmp) {
+ if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
+ smallest = start;
+ }
+ if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
+ largest = end;
+ }
+ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+ fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+ }
+
+ // Try to get oldest ancester time from the class itself or table properties
+ // if table reader is already pinned.
+ // 0 means the information is not available.
+ uint64_t TryGetOldestAncesterTime() {
+ if (oldest_ancester_time != kUnknownOldestAncesterTime) {
+ return oldest_ancester_time;
+ } else if (fd.table_reader != nullptr &&
+ fd.table_reader->GetTableProperties() != nullptr) {
+ return fd.table_reader->GetTableProperties()->creation_time;
+ }
+ return kUnknownOldestAncesterTime;
+ }
+
+ uint64_t TryGetFileCreationTime() {
+ if (file_creation_time != kUnknownFileCreationTime) {
+ return file_creation_time;
+ } else if (fd.table_reader != nullptr &&
+ fd.table_reader->GetTableProperties() != nullptr) {
+ return fd.table_reader->GetTableProperties()->file_creation_time;
+ }
+ return kUnknownFileCreationTime;
+ }
+};
+
+// A compressed copy of file metadata that contains just the minimum data
+// needed to serve read operations, while still keeping a pointer to the full
+// metadata of the file in case it is needed.
+struct FdWithKeyRange {
+ FileDescriptor fd;
+ FileMetaData* file_metadata; // Point to all metadata
+ Slice smallest_key; // slice that contain smallest key
+ Slice largest_key; // slice that contain largest key
+
+ FdWithKeyRange()
+ : fd(),
+ file_metadata(nullptr),
+ smallest_key(),
+ largest_key() {
+ }
+
+ FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
+ FileMetaData* _file_metadata)
+ : fd(_fd),
+ file_metadata(_file_metadata),
+ smallest_key(_smallest_key),
+ largest_key(_largest_key) {}
+};
+
+// Data structure to store an array of FdWithKeyRange in one level.
+// The actual data is guaranteed to be stored contiguously.
+struct LevelFilesBrief {
+ size_t num_files;
+ FdWithKeyRange* files;
+ LevelFilesBrief() {
+ num_files = 0;
+ files = nullptr;
+ }
+};
+
+// The state of a DB at any given time is referred to as a Version.
+// Any modification to the Version is considered a Version Edit. A Version is
+// constructed by joining a sequence of Version Edits. Version Edits are written
+// to the MANIFEST file.
+class VersionEdit {
+ public:
+ void Clear();
+
+ void SetDBId(const std::string& db_id) {
+ has_db_id_ = true;
+ db_id_ = db_id;
+ }
+ bool HasDbId() const { return has_db_id_; }
+ const std::string& GetDbId() const { return db_id_; }
+
+ void SetComparatorName(const Slice& name) {
+ has_comparator_ = true;
+ comparator_ = name.ToString();
+ }
+ bool HasComparatorName() const { return has_comparator_; }
+ const std::string& GetComparatorName() const { return comparator_; }
+
+ void SetLogNumber(uint64_t num) {
+ has_log_number_ = true;
+ log_number_ = num;
+ }
+ bool HasLogNumber() const { return has_log_number_; }
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ void SetPrevLogNumber(uint64_t num) {
+ has_prev_log_number_ = true;
+ prev_log_number_ = num;
+ }
+ bool HasPrevLogNumber() const { return has_prev_log_number_; }
+ uint64_t GetPrevLogNumber() const { return prev_log_number_; }
+
+ void SetNextFile(uint64_t num) {
+ has_next_file_number_ = true;
+ next_file_number_ = num;
+ }
+ bool HasNextFile() const { return has_next_file_number_; }
+ uint64_t GetNextFile() const { return next_file_number_; }
+
+ void SetMaxColumnFamily(uint32_t max_column_family) {
+ has_max_column_family_ = true;
+ max_column_family_ = max_column_family;
+ }
+ bool HasMaxColumnFamily() const { return has_max_column_family_; }
+ uint32_t GetMaxColumnFamily() const { return max_column_family_; }
+
+ void SetMinLogNumberToKeep(uint64_t num) {
+ has_min_log_number_to_keep_ = true;
+ min_log_number_to_keep_ = num;
+ }
+ bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
+ uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
+
+ void SetLastSequence(SequenceNumber seq) {
+ has_last_sequence_ = true;
+ last_sequence_ = seq;
+ }
+ bool HasLastSequence() const { return has_last_sequence_; }
+ SequenceNumber GetLastSequence() const { return last_sequence_; }
+
+ // Delete the specified "file" from the specified "level".
+ void DeleteFile(int level, uint64_t file) {
+ deleted_files_.emplace(level, file);
+ }
+
+ // Retrieve the files deleted as well as their associated levels.
+ using DeletedFiles = std::set<std::pair<int, uint64_t>>;
+ const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
+
+ // Add the specified file at the specified level.
+ // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+ // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+ // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
+ // referred to by this file if any, kInvalidBlobFileNumber otherwise.
+ void AddFile(int level, uint64_t file, uint32_t file_path_id,
+ uint64_t file_size, const InternalKey& smallest,
+ const InternalKey& largest, const SequenceNumber& smallest_seqno,
+ const SequenceNumber& largest_seqno, bool marked_for_compaction,
+ uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time,
+ uint64_t file_creation_time, const std::string& file_checksum,
+ const std::string& file_checksum_func_name) {
+ assert(smallest_seqno <= largest_seqno);
+ new_files_.emplace_back(
+ level, FileMetaData(file, file_path_id, file_size, smallest, largest,
+ smallest_seqno, largest_seqno,
+ marked_for_compaction, oldest_blob_file_number,
+ oldest_ancester_time, file_creation_time,
+ file_checksum, file_checksum_func_name));
+ }
+
+ void AddFile(int level, const FileMetaData& f) {
+ assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
+ new_files_.emplace_back(level, f);
+ }
+
+ // Retrieve the files added as well as their associated levels.
+ using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+ const NewFiles& GetNewFiles() const { return new_files_; }
+
+  // Number of file additions and deletions recorded in this edit
+ size_t NumEntries() const { return new_files_.size() + deleted_files_.size(); }
+
+ void SetColumnFamily(uint32_t column_family_id) {
+ column_family_ = column_family_id;
+ }
+ uint32_t GetColumnFamily() const { return column_family_; }
+
+ // set column family ID by calling SetColumnFamily()
+ void AddColumnFamily(const std::string& name) {
+ assert(!is_column_family_drop_);
+ assert(!is_column_family_add_);
+ assert(NumEntries() == 0);
+ is_column_family_add_ = true;
+ column_family_name_ = name;
+ }
+
+ // set column family ID by calling SetColumnFamily()
+ void DropColumnFamily() {
+ assert(!is_column_family_drop_);
+ assert(!is_column_family_add_);
+ assert(NumEntries() == 0);
+ is_column_family_drop_ = true;
+ }
+
+ bool IsColumnFamilyManipulation() const {
+ return is_column_family_add_ || is_column_family_drop_;
+ }
+
+ void MarkAtomicGroup(uint32_t remaining_entries) {
+ is_in_atomic_group_ = true;
+ remaining_entries_ = remaining_entries;
+ }
+ bool IsInAtomicGroup() const { return is_in_atomic_group_; }
+ uint32_t GetRemainingEntries() const { return remaining_entries_; }
+
+ // return true on success.
+ bool EncodeTo(std::string* dst) const;
+ Status DecodeFrom(const Slice& src);
+
+ std::string DebugString(bool hex_key = false) const;
+ std::string DebugJSON(int edit_num, bool hex_key = false) const;
+
+ private:
+ friend class ReactiveVersionSet;
+ friend class VersionSet;
+ friend class Version;
+ friend class AtomicGroupReadBuffer;
+
+ bool GetLevel(Slice* input, int* level, const char** msg);
+
+ const char* DecodeNewFile4From(Slice* input);
+
+ int max_level_ = 0;
+ std::string db_id_;
+ std::string comparator_;
+ uint64_t log_number_ = 0;
+ uint64_t prev_log_number_ = 0;
+ uint64_t next_file_number_ = 0;
+ uint32_t max_column_family_ = 0;
+  // WAL log numbers smaller than this can safely be deleted
+ uint64_t min_log_number_to_keep_ = 0;
+ SequenceNumber last_sequence_ = 0;
+ bool has_db_id_ = false;
+ bool has_comparator_ = false;
+ bool has_log_number_ = false;
+ bool has_prev_log_number_ = false;
+ bool has_next_file_number_ = false;
+ bool has_max_column_family_ = false;
+ bool has_min_log_number_to_keep_ = false;
+ bool has_last_sequence_ = false;
+
+ DeletedFiles deleted_files_;
+ NewFiles new_files_;
+
+  // Each version edit record should have column_family_ set.
+  // If it's not set, it defaults to 0 (the default column family).
+ uint32_t column_family_ = 0;
+  // A version edit can be either a column family add or a column family
+  // drop. If it's a column family add, it also includes the column family
+  // name.
+ bool is_column_family_drop_ = false;
+ bool is_column_family_add_ = false;
+ std::string column_family_name_;
+
+ bool is_in_atomic_group_ = false;
+ uint32_t remaining_entries_ = 0;
+};
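
As a usage sketch only (it assumes compilation inside the RocksDB tree so that db/version_edit.h and its dependencies are available; the specific file numbers and keys are arbitrary), a round trip through the public API declared above looks roughly like the unit tests further below:

#include <cassert>
#include <string>

#include "db/version_edit.h"

using namespace ROCKSDB_NAMESPACE;

void RoundTripSketch() {
  VersionEdit edit;
  edit.SetComparatorName("leveldb.BytewiseComparator");
  edit.SetLogNumber(100);
  edit.SetNextFile(200);
  edit.SetLastSequence(1000);
  edit.AddFile(/*level=*/1, /*file=*/7, /*file_path_id=*/0, /*file_size=*/4096,
               InternalKey("a", 1, kTypeValue), InternalKey("z", 2, kTypeValue),
               /*smallest_seqno=*/1, /*largest_seqno=*/2,
               /*marked_for_compaction=*/false, kInvalidBlobFileNumber,
               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
               kUnknownFileChecksum, kUnknownFileChecksumFuncName);
  edit.DeleteFile(/*level=*/2, /*file=*/5);

  std::string record;
  edit.EncodeTo(&record);  // this is the payload that gets appended to the MANIFEST

  VersionEdit parsed;
  Status s = parsed.DecodeFrom(record);
  assert(s.ok());
  assert(parsed.GetNewFiles().size() == 1);
  assert(parsed.GetDeletedFiles().size() == 1);
}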
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc
new file mode 100644
index 000000000..8bc884df9
--- /dev/null
+++ b/src/rocksdb/db/version_edit_test.cc
@@ -0,0 +1,286 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ parsed.EncodeTo(&encoded2);
+ ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest : public testing::Test {};
+
+TEST_F(VersionEditTest, EncodeDecode) {
+ static const uint64_t kBig = 1ull << 50;
+ static const uint32_t kBig32Bit = 1ull << 30;
+
+ VersionEdit edit;
+ for (int i = 0; i < 4; i++) {
+ TestEncodeDecode(edit);
+ edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0,
+ InternalKey("foo", kBig + 500 + i, kTypeValue),
+ InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+ kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber,
+ 888, 678, "234", "crc32c");
+ edit.DeleteFile(4, kBig + 700 + i);
+ }
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
+ static const uint64_t kBig = 1ull << 50;
+
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+ InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+ kBig + 601, false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
+ InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
+ kBig + 602, true, kInvalidBlobFileNumber, 666, 888,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
+ InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
+ kBig + 603, true, 1001, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+
+ edit.DeleteFile(4, 700);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ auto& new_files = parsed.GetNewFiles();
+ ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+ ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+ ASSERT_TRUE(new_files[2].second.marked_for_compaction);
+ ASSERT_TRUE(new_files[3].second.marked_for_compaction);
+ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+ ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+ ASSERT_EQ(0u, new_files[2].second.fd.GetPathId());
+ ASSERT_EQ(0u, new_files[3].second.fd.GetPathId());
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[0].second.oldest_blob_file_number);
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[1].second.oldest_blob_file_number);
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[2].second.oldest_blob_file_number);
+ ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number);
+}
+
+TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
+ static const uint64_t kBig = 1ull << 50;
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+ InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+ kBig + 601, false, kInvalidBlobFileNumber, 686, 868, "234",
+ "crc32c");
+ edit.DeleteFile(4, 700);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+
+ std::string encoded;
+
+  // Callback function to add extra customized fields.
+ bool first = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+ std::string* str = reinterpret_cast<std::string*>(arg);
+ PutVarint32(str, 33);
+ const std::string str1 = "random_string";
+ PutLengthPrefixedSlice(str, str1);
+ if (first) {
+ first = false;
+ PutVarint32(str, 22);
+ const std::string str2 = "s";
+ PutLengthPrefixedSlice(str, str2);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ edit.EncodeTo(&encoded);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_TRUE(!first);
+ auto& new_files = parsed.GetNewFiles();
+ ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+ ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+ ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+ ASSERT_EQ(1u, parsed.GetDeletedFiles().size());
+}
+
+TEST_F(VersionEditTest, NewFile4NotSupportedField) {
+ static const uint64_t kBig = 1ull << 50;
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+
+ std::string encoded;
+
+  // Callback function to add extra customized fields.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+ std::string* str = reinterpret_cast<std::string*>(arg);
+ const std::string str1 = "s";
+ PutLengthPrefixedSlice(str, str1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ edit.EncodeTo(&encoded);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_NOK(s);
+}
+
+TEST_F(VersionEditTest, EncodeEmptyFile) {
+ VersionEdit edit;
+ edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ std::string buffer;
+ ASSERT_TRUE(!edit.EncodeTo(&buffer));
+}
+
+TEST_F(VersionEditTest, ColumnFamilyTest) {
+ VersionEdit edit;
+ edit.SetColumnFamily(2);
+ edit.AddColumnFamily("column_family");
+ edit.SetMaxColumnFamily(5);
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetColumnFamily(3);
+ edit.DropColumnFamily();
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, MinLogNumberToKeep) {
+ VersionEdit edit;
+ edit.SetMinLogNumberToKeep(13);
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetMinLogNumberToKeep(23);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AtomicGroupTest) {
+ VersionEdit edit;
+ edit.MarkAtomicGroup(1);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, IgnorableField) {
+ VersionEdit ve;
+ std::string encoded;
+
+  // Declared size of the ignorable field is larger than the remaining data
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded,
+ 0x2710 /* A field with kTagSafeIgnoreMask set */,
+ 5 /* fieldlength 5 */);
+ encoded += "abc"; // Only fills 3 bytes,
+ ASSERT_NOK(ve.DecodeFrom(encoded));
+
+ encoded.clear();
+ // Error when seeing unidentified tag that is not ignorable
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+  // This is a customized tag without the safe-ignore mask, so it cannot be skipped
+ PutVarint32Varint64(&encoded, 666 /* A field with kTagSafeIgnoreMask unset */,
+ 3 /* fieldlength 3 */);
+ encoded += "abc"; // Fill 3 bytes
+ PutVarint32Varint64(&encoded, 3 /* next file number */, 88);
+ ASSERT_NOK(ve.DecodeFrom(encoded));
+
+  // Safely ignore an unidentified but safely-ignorable entry
+ encoded.clear();
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded,
+ 0x2710 /* A field with kTagSafeIgnoreMask set */,
+ 3 /* fieldlength 3 */);
+ encoded += "abc"; // Fill 3 bytes
+ PutVarint32Varint64(&encoded, 3 /* kNextFileNumber */, 88);
+
+ ASSERT_OK(ve.DecodeFrom(encoded));
+
+ ASSERT_TRUE(ve.HasLogNumber());
+ ASSERT_TRUE(ve.HasNextFile());
+ ASSERT_EQ(66, ve.GetLogNumber());
+ ASSERT_EQ(88, ve.GetNextFile());
+}
+
+TEST_F(VersionEditTest, DbId) {
+ VersionEdit edit;
+ edit.SetDBId("ab34-cd12-435f-er00");
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetDBId("34ba-cd12-435f-er01");
+ TestEncodeDecode(edit);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc
new file mode 100644
index 000000000..e913a97dd
--- /dev/null
+++ b/src/rocksdb/db/version_set.cc
@@ -0,0 +1,6005 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <stdio.h>
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "compaction/compaction.h"
+#include "db/internal_stats.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/merging_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/multiget_context.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Find File in LevelFilesBrief data structure
+// Within an index range defined by left and right
+int FindFileInRange(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level,
+ const Slice& key,
+ uint32_t left,
+ uint32_t right) {
+ auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+ return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0;
+ };
+ const auto &b = file_level.files;
+ return static_cast<int>(std::lower_bound(b + left,
+ b + right, key, cmp) - b);
+}
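
Illustrative only (plain integers stand in for the files' largest internal keys): the same std::lower_bound idiom, showing that the function returns the index of the earliest file whose largest key is >= the search key, or `right` when no such file exists.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Largest key of each "file" in a sorted, non-overlapping level.
  std::vector<int> largest = {10, 20, 30, 40};

  auto find_file = [&](int key, size_t left, size_t right) {
    // Comparator mirrors the lambda above: "file's largest key < search key".
    auto cmp = [](int file_largest, int k) { return file_largest < k; };
    return static_cast<int>(
        std::lower_bound(largest.begin() + left, largest.begin() + right, key,
                         cmp) -
        largest.begin());
  };

  assert(find_file(25, 0, largest.size()) == 2);  // file with largest key 30
  assert(find_file(40, 0, largest.size()) == 3);  // exact match on a largest key
  assert(find_file(99, 0, largest.size()) == 4);  // past the end -> right
  return 0;
}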
+
+Status OverlapWithIterator(const Comparator* ucmp,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ InternalIterator* iter,
+ bool* overlap) {
+ InternalKey range_start(smallest_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ iter->Seek(range_start.Encode());
+ if (!iter->status().ok()) {
+ return iter->status();
+ }
+
+ *overlap = false;
+ if (iter->Valid()) {
+ ParsedInternalKey seek_result;
+ if (!ParseInternalKey(iter->key(), &seek_result)) {
+ return Status::Corruption("DB have corrupted keys");
+ }
+
+ if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <=
+ 0) {
+ *overlap = true;
+ }
+ }
+
+ return iter->status();
+}
+
+// Class to help choose the next file to search for the particular key.
+// Searches and returns files level by level.
+// We can search level-by-level since entries never hop across
+// levels. Therefore we are guaranteed that if we find data
+// in a smaller level, later levels are irrelevant (unless we
+// are MergeInProgress).
+class FilePicker {
+ public:
+ FilePicker(std::vector<FileMetaData*>* files, const Slice& user_key,
+ const Slice& ikey, autovector<LevelFilesBrief>* file_levels,
+ unsigned int num_levels, FileIndexer* file_indexer,
+ const Comparator* user_comparator,
+ const InternalKeyComparator* internal_comparator)
+ : num_levels_(num_levels),
+ curr_level_(static_cast<unsigned int>(-1)),
+ returned_file_level_(static_cast<unsigned int>(-1)),
+ hit_file_level_(static_cast<unsigned int>(-1)),
+ search_left_bound_(0),
+ search_right_bound_(FileIndexer::kLevelMaxIndex),
+#ifndef NDEBUG
+ files_(files),
+#endif
+ level_files_brief_(file_levels),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(nullptr),
+ user_key_(user_key),
+ ikey_(ikey),
+ file_indexer_(file_indexer),
+ user_comparator_(user_comparator),
+ internal_comparator_(internal_comparator) {
+#ifdef NDEBUG
+ (void)files;
+#endif
+ // Setup member variables to search first level.
+ search_ended_ = !PrepareNextLevel();
+ if (!search_ended_) {
+ // Prefetch Level 0 table data to avoid cache miss if possible.
+ for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+ auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+ if (r) {
+ r->Prepare(ikey);
+ }
+ }
+ }
+ }
+
+ int GetCurrentLevel() const { return curr_level_; }
+
+ FdWithKeyRange* GetNextFile() {
+ while (!search_ended_) { // Loops over different levels.
+ while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
+ // Loops over all files in current level.
+ FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_];
+ hit_file_level_ = curr_level_;
+ is_hit_file_last_in_level_ =
+ curr_index_in_curr_level_ == curr_file_level_->num_files - 1;
+ int cmp_largest = -1;
+
+ // Do key range filtering of files or/and fractional cascading if:
+ // (1) not all the files are in level 0, or
+ // (2) there are more than 3 current level files
+        // If there are only 3 or fewer current-level files in the system, we skip
+ // the key range filtering. In this case, more likely, the system is
+ // highly tuned to minimize number of tables queried by each query,
+ // so it is unlikely that key range filtering is more efficient than
+ // querying the files.
+ if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+          // Check if key is within a file's range. If the search left bound
+          // and right bound point to the same file, we are sure the key falls
+          // in range.
+ assert(curr_level_ == 0 ||
+ curr_index_in_curr_level_ == start_index_in_curr_level_ ||
+ user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->smallest_key)) <= 0);
+
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->smallest_key));
+ if (cmp_smallest >= 0) {
+ cmp_largest = user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->largest_key));
+ }
+
+ // Setup file search bound for the next level based on the
+ // comparison results
+ if (curr_level_ > 0) {
+ file_indexer_->GetNextLevelIndex(curr_level_,
+ curr_index_in_curr_level_,
+ cmp_smallest, cmp_largest,
+ &search_left_bound_,
+ &search_right_bound_);
+ }
+ // Key falls out of current file's range
+ if (cmp_smallest < 0 || cmp_largest > 0) {
+ if (curr_level_ == 0) {
+ ++curr_index_in_curr_level_;
+ continue;
+ } else {
+ // Search next level.
+ break;
+ }
+ }
+ }
+#ifndef NDEBUG
+ // Sanity check to make sure that the files are correctly sorted
+ if (prev_file_) {
+ if (curr_level_ != 0) {
+ int comp_sign = internal_comparator_->Compare(
+ prev_file_->largest_key, f->smallest_key);
+ assert(comp_sign < 0);
+ } else {
+            // level == 0: the current file cannot be newer than the previous
+            // one. The compressed data structure has no seqno attribute, so
+            // check against the full metadata in files_.
+ assert(curr_index_in_curr_level_ > 0);
+ assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_],
+ files_[0][curr_index_in_curr_level_-1]));
+ }
+ }
+ prev_file_ = f;
+#endif
+ returned_file_level_ = curr_level_;
+ if (curr_level_ > 0 && cmp_largest < 0) {
+ // No more files to search in this level.
+ search_ended_ = !PrepareNextLevel();
+ } else {
+ ++curr_index_in_curr_level_;
+ }
+ return f;
+ }
+ // Start searching next level.
+ search_ended_ = !PrepareNextLevel();
+ }
+ // Search ended.
+ return nullptr;
+ }
+
+ // getter for current file level
+ // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ // Returns true if the most recent "hit file" (i.e., one returned by
+ // GetNextFile()) is at the last index in its level.
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ private:
+ unsigned int num_levels_;
+ unsigned int curr_level_;
+ unsigned int returned_file_level_;
+ unsigned int hit_file_level_;
+ int32_t search_left_bound_;
+ int32_t search_right_bound_;
+#ifndef NDEBUG
+ std::vector<FileMetaData*>* files_;
+#endif
+ autovector<LevelFilesBrief>* level_files_brief_;
+ bool search_ended_;
+ bool is_hit_file_last_in_level_;
+ LevelFilesBrief* curr_file_level_;
+ unsigned int curr_index_in_curr_level_;
+ unsigned int start_index_in_curr_level_;
+ Slice user_key_;
+ Slice ikey_;
+ FileIndexer* file_indexer_;
+ const Comparator* user_comparator_;
+ const InternalKeyComparator* internal_comparator_;
+#ifndef NDEBUG
+ FdWithKeyRange* prev_file_;
+#endif
+
+ // Setup local variables to search next level.
+ // Returns false if there are no more levels to search.
+ bool PrepareNextLevel() {
+ curr_level_++;
+ while (curr_level_ < num_levels_) {
+ curr_file_level_ = &(*level_files_brief_)[curr_level_];
+ if (curr_file_level_->num_files == 0) {
+ // When current level is empty, the search bound generated from upper
+ // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+ // also empty.
+ assert(search_left_bound_ == 0);
+ assert(search_right_bound_ == -1 ||
+ search_right_bound_ == FileIndexer::kLevelMaxIndex);
+ // Since current level is empty, it will need to search all files in
+ // the next level
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+
+ // Some files may overlap each other. We find
+ // all files that overlap user_key and process them in order from
+ // newest to oldest. In the context of merge-operator, this can occur at
+ // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+ // are always compacted into a single entry).
+ int32_t start_index;
+ if (curr_level_ == 0) {
+ // On Level-0, we read through all files to check for overlap.
+ start_index = 0;
+ } else {
+ // On Level-n (n>=1), files are sorted. Binary search to find the
+ // earliest file whose largest key >= ikey. Search left bound and
+ // right bound are used to narrow the range.
+ if (search_left_bound_ <= search_right_bound_) {
+ if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
+ search_right_bound_ =
+ static_cast<int32_t>(curr_file_level_->num_files) - 1;
+ }
+ // `search_right_bound_` is an inclusive upper-bound, but since it was
+ // determined based on user key, it is still possible the lookup key
+ // falls to the right of `search_right_bound_`'s corresponding file.
+ // So, pass a limit one higher, which allows us to detect this case.
+ start_index =
+ FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
+ static_cast<uint32_t>(search_left_bound_),
+ static_cast<uint32_t>(search_right_bound_) + 1);
+ if (start_index == search_right_bound_ + 1) {
+ // `ikey_` comes after `search_right_bound_`. The lookup key does
+ // not exist on this level, so let's skip this level and do a full
+ // binary search on the next level.
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+ } else {
+ // search_left_bound > search_right_bound, key does not exist in
+ // this level. Since no comparison is done in this level, it will
+ // need to search all files in the next level.
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+ }
+ start_index_in_curr_level_ = start_index;
+ curr_index_in_curr_level_ = start_index;
+#ifndef NDEBUG
+ prev_file_ = nullptr;
+#endif
+ return true;
+ }
+ // curr_level_ = num_levels_. So, no more levels to search.
+ return false;
+ }
+};
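
A deliberately simplified, self-contained sketch of the level-by-level idea the FilePicker above implements (the names here are made up; it omits fractional cascading via FileIndexer, the L0 prefetch, and the newest-to-oldest handling of overlapping L0 files):

#include <optional>
#include <string>
#include <vector>

// Toy model: each level is a list of (smallest, largest) user-key ranges.
struct ToyFile {
  std::string smallest;
  std::string largest;
  int file_number;
};

// Scan levels from newest to oldest and return the first candidate file whose
// key range could contain user_key. Because entries never "hop" levels, once a
// candidate is found in a smaller level, older levels need not be consulted
// (the real code must still handle overlapping L0 files and merge operands).
std::optional<int> PickCandidate(
    const std::vector<std::vector<ToyFile>>& levels,
    const std::string& user_key) {
  for (const auto& level : levels) {
    for (const auto& f : level) {
      if (user_key >= f.smallest && user_key <= f.largest) {
        return f.file_number;
      }
    }
  }
  return std::nullopt;
}

int main() {
  std::vector<std::vector<ToyFile>> levels = {
      {{"m", "p", 11}},                  // L0
      {{"a", "f", 21}, {"g", "z", 22}},  // L1
  };
  return PickCandidate(levels, "n").value_or(-1) == 11 ? 0 : 1;
}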
+
+class FilePickerMultiGet {
+ private:
+ struct FilePickerContext;
+
+ public:
+ FilePickerMultiGet(MultiGetRange* range,
+ autovector<LevelFilesBrief>* file_levels,
+ unsigned int num_levels, FileIndexer* file_indexer,
+ const Comparator* user_comparator,
+ const InternalKeyComparator* internal_comparator)
+ : num_levels_(num_levels),
+ curr_level_(static_cast<unsigned int>(-1)),
+ returned_file_level_(static_cast<unsigned int>(-1)),
+ hit_file_level_(static_cast<unsigned int>(-1)),
+ range_(range),
+ batch_iter_(range->begin()),
+ batch_iter_prev_(range->begin()),
+ maybe_repeat_key_(false),
+ current_level_range_(*range, range->begin(), range->end()),
+ current_file_range_(*range, range->begin(), range->end()),
+ level_files_brief_(file_levels),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(nullptr),
+ file_indexer_(file_indexer),
+ user_comparator_(user_comparator),
+ internal_comparator_(internal_comparator) {
+ for (auto iter = range_->begin(); iter != range_->end(); ++iter) {
+ fp_ctx_array_[iter.index()] =
+ FilePickerContext(0, FileIndexer::kLevelMaxIndex);
+ }
+
+ // Setup member variables to search first level.
+ search_ended_ = !PrepareNextLevel();
+ if (!search_ended_) {
+ // REVISIT
+ // Prefetch Level 0 table data to avoid cache miss if possible.
+ // As of now, only PlainTableReader and CuckooTableReader do any
+ // prefetching. This may not be necessary anymore once we implement
+ // batching in those table readers
+ for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+ auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+ if (r) {
+ for (auto iter = range_->begin(); iter != range_->end(); ++iter) {
+ r->Prepare(iter->ikey);
+ }
+ }
+ }
+ }
+ }
+
+ int GetCurrentLevel() const { return curr_level_; }
+
+ // Iterates through files in the current level until it finds a file that
+  // contains at least one key from the MultiGet batch
+ bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
+ size_t* file_index, FdWithKeyRange** fd,
+ bool* is_last_key_in_file) {
+ size_t curr_file_index = *file_index;
+ FdWithKeyRange* f = nullptr;
+ bool file_hit = false;
+ int cmp_largest = -1;
+ if (curr_file_index >= curr_file_level_->num_files) {
+ // In the unlikely case the next key is a duplicate of the current key,
+ // and the current key is the last in the level and the internal key
+ // was not found, we need to skip lookup for the remaining keys and
+ // reset the search bounds
+ if (batch_iter_ != current_level_range_.end()) {
+ ++batch_iter_;
+ for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ }
+ }
+ return false;
+ }
+ // Loops over keys in the MultiGet batch until it finds a file with
+    // at least one of the keys. Then it keeps moving forward until it reaches
+    // the last key in the batch that falls in that file
+ while (batch_iter_ != current_level_range_.end() &&
+ (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level ==
+ curr_file_index ||
+ !file_hit)) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+ f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level];
+ Slice& user_key = batch_iter_->ukey;
+
+ // Do key range filtering of files or/and fractional cascading if:
+ // (1) not all the files are in level 0, or
+ // (2) there are more than 3 current level files
+      // If there are only 3 or fewer current-level files in the system, we
+ // skip the key range filtering. In this case, more likely, the system
+ // is highly tuned to minimize number of tables queried by each query,
+ // so it is unlikely that key range filtering is more efficient than
+ // querying the files.
+ if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+        // Check if key is within a file's range. If the search left bound
+        // and right bound point to the same file, we are sure the key falls
+        // in range.
+ assert(curr_level_ == 0 ||
+ fp_ctx.curr_index_in_curr_level ==
+ fp_ctx.start_index_in_curr_level ||
+ user_comparator_->Compare(user_key,
+ ExtractUserKey(f->smallest_key)) <= 0);
+
+ int cmp_smallest = user_comparator_->Compare(
+ user_key, ExtractUserKey(f->smallest_key));
+ if (cmp_smallest >= 0) {
+ cmp_largest = user_comparator_->Compare(
+ user_key, ExtractUserKey(f->largest_key));
+ } else {
+ cmp_largest = -1;
+ }
+
+ // Setup file search bound for the next level based on the
+ // comparison results
+ if (curr_level_ > 0) {
+ file_indexer_->GetNextLevelIndex(
+ curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest,
+ cmp_largest, &fp_ctx.search_left_bound,
+ &fp_ctx.search_right_bound);
+ }
+ // Key falls out of current file's range
+ if (cmp_smallest < 0 || cmp_largest > 0) {
+ next_file_range->SkipKey(batch_iter_);
+ } else {
+ file_hit = true;
+ }
+ } else {
+ file_hit = true;
+ }
+ if (cmp_largest == 0) {
+ // cmp_largest is 0, which means the next key will not be in this
+        // file, so stop looking further. Also don't increment batch_iter_
+ // as we may have to look for this key in the next file if we don't
+ // find it in this one
+ break;
+ } else {
+ if (curr_level_ == 0) {
+ // We need to look through all files in level 0
+ ++fp_ctx.curr_index_in_curr_level;
+ }
+ ++batch_iter_;
+ }
+ if (!file_hit) {
+ curr_file_index =
+ (batch_iter_ != current_level_range_.end())
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+ : curr_file_level_->num_files;
+ }
+ }
+
+ *fd = f;
+ *file_index = curr_file_index;
+ *is_last_key_in_file = cmp_largest == 0;
+ return file_hit;
+ }
+
+ FdWithKeyRange* GetNextFile() {
+ while (!search_ended_) {
+ // Start searching next level.
+ if (batch_iter_ == current_level_range_.end()) {
+ search_ended_ = !PrepareNextLevel();
+ continue;
+ } else {
+ if (maybe_repeat_key_) {
+ maybe_repeat_key_ = false;
+ // Check if we found the final value for the last key in the
+ // previous lookup range. If we did, then there's no need to look
+ // any further for that key, so advance batch_iter_. Else, keep
+ // batch_iter_ positioned on that key so we look it up again in
+ // the next file
+ // For L0, always advance the key because we will look in the next
+ // file regardless for all keys not found yet
+ if (current_level_range_.CheckKeyDone(batch_iter_) ||
+ curr_level_ == 0) {
+ ++batch_iter_;
+ }
+ }
+ // batch_iter_prev_ will become the start key for the next file
+ // lookup
+ batch_iter_prev_ = batch_iter_;
+ }
+
+ MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
+ current_level_range_.end());
+ size_t curr_file_index =
+ (batch_iter_ != current_level_range_.end())
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+ : curr_file_level_->num_files;
+ FdWithKeyRange* f;
+ bool is_last_key_in_file;
+ if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
+ &is_last_key_in_file)) {
+ search_ended_ = !PrepareNextLevel();
+ } else {
+ MultiGetRange::Iterator upper_key = batch_iter_;
+ if (is_last_key_in_file) {
+ // Since cmp_largest is 0, batch_iter_ still points to the last key
+ // that falls in this file, instead of the next one. Increment
+ // upper_key so we can set the range properly for SST MultiGet
+ ++upper_key;
+ ++(fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level);
+ maybe_repeat_key_ = true;
+ }
+ // Set the range for this file
+ current_file_range_ =
+ MultiGetRange(next_file_range, batch_iter_prev_, upper_key);
+ returned_file_level_ = curr_level_;
+ hit_file_level_ = curr_level_;
+ is_hit_file_last_in_level_ =
+ curr_file_index == curr_file_level_->num_files - 1;
+ return f;
+ }
+ }
+
+ // Search ended
+ return nullptr;
+ }
+
+ // getter for current file level
+ // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ // Returns true if the most recent "hit file" (i.e., one returned by
+ // GetNextFile()) is at the last index in its level.
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ const MultiGetRange& CurrentFileRange() { return current_file_range_; }
+
+ private:
+ unsigned int num_levels_;
+ unsigned int curr_level_;
+ unsigned int returned_file_level_;
+ unsigned int hit_file_level_;
+
+ struct FilePickerContext {
+ int32_t search_left_bound;
+ int32_t search_right_bound;
+ unsigned int curr_index_in_curr_level;
+ unsigned int start_index_in_curr_level;
+
+ FilePickerContext(int32_t left, int32_t right)
+ : search_left_bound(left), search_right_bound(right),
+ curr_index_in_curr_level(0), start_index_in_curr_level(0) {}
+
+ FilePickerContext() = default;
+ };
+ std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
+ MultiGetRange* range_;
+ // Iterator to iterate through the keys in a MultiGet batch, that gets reset
+ // at the beginning of each level. Each call to GetNextFile() will position
+ // batch_iter_ at or right after the last key that was found in the returned
+ // SST file
+ MultiGetRange::Iterator batch_iter_;
+  // An iterator that records the previous position of batch_iter_, i.e. the
+  // last key found in the previous SST file, in order to serve as the start
+  // of the batch key range for the next SST file
+ MultiGetRange::Iterator batch_iter_prev_;
+ bool maybe_repeat_key_;
+ MultiGetRange current_level_range_;
+ MultiGetRange current_file_range_;
+ autovector<LevelFilesBrief>* level_files_brief_;
+ bool search_ended_;
+ bool is_hit_file_last_in_level_;
+ LevelFilesBrief* curr_file_level_;
+ FileIndexer* file_indexer_;
+ const Comparator* user_comparator_;
+ const InternalKeyComparator* internal_comparator_;
+
+ // Setup local variables to search next level.
+ // Returns false if there are no more levels to search.
+ bool PrepareNextLevel() {
+ if (curr_level_ == 0) {
+ MultiGetRange::Iterator mget_iter = current_level_range_.begin();
+ if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
+ curr_file_level_->num_files) {
+ batch_iter_prev_ = current_level_range_.begin();
+ batch_iter_ = current_level_range_.begin();
+ return true;
+ }
+ }
+
+ curr_level_++;
+ // Reset key range to saved value
+ while (curr_level_ < num_levels_) {
+ bool level_contains_keys = false;
+ curr_file_level_ = &(*level_files_brief_)[curr_level_];
+ if (curr_file_level_->num_files == 0) {
+ // When current level is empty, the search bound generated from upper
+ // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+ // also empty.
+
+ for (auto mget_iter = current_level_range_.begin();
+ mget_iter != current_level_range_.end(); ++mget_iter) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+
+ assert(fp_ctx.search_left_bound == 0);
+ assert(fp_ctx.search_right_bound == -1 ||
+ fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
+ // Since current level is empty, it will need to search all files in
+ // the next level
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ }
+ // Skip all subsequent empty levels
+ do {
+ ++curr_level_;
+ } while ((curr_level_ < num_levels_) &&
+ (*level_files_brief_)[curr_level_].num_files == 0);
+ continue;
+ }
+
+ // Some files may overlap each other. We find
+ // all files that overlap user_key and process them in order from
+ // newest to oldest. In the context of merge-operator, this can occur at
+ // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+ // are always compacted into a single entry).
+ int32_t start_index = -1;
+ current_level_range_ =
+ MultiGetRange(*range_, range_->begin(), range_->end());
+ for (auto mget_iter = current_level_range_.begin();
+ mget_iter != current_level_range_.end(); ++mget_iter) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+ if (curr_level_ == 0) {
+ // On Level-0, we read through all files to check for overlap.
+ start_index = 0;
+ level_contains_keys = true;
+ } else {
+ // On Level-n (n>=1), files are sorted. Binary search to find the
+ // earliest file whose largest key >= ikey. Search left bound and
+ // right bound are used to narrow the range.
+ if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) {
+ if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) {
+ fp_ctx.search_right_bound =
+ static_cast<int32_t>(curr_file_level_->num_files) - 1;
+ }
+ // `search_right_bound_` is an inclusive upper-bound, but since it
+ // was determined based on user key, it is still possible the lookup
+ // key falls to the right of `search_right_bound_`'s corresponding
+ // file. So, pass a limit one higher, which allows us to detect this
+ // case.
+ Slice& ikey = mget_iter->ikey;
+ start_index = FindFileInRange(
+ *internal_comparator_, *curr_file_level_, ikey,
+ static_cast<uint32_t>(fp_ctx.search_left_bound),
+ static_cast<uint32_t>(fp_ctx.search_right_bound) + 1);
+ if (start_index == fp_ctx.search_right_bound + 1) {
+ // `ikey_` comes after `search_right_bound_`. The lookup key does
+ // not exist on this level, so let's skip this level and do a full
+ // binary search on the next level.
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ current_level_range_.SkipKey(mget_iter);
+ continue;
+ } else {
+ level_contains_keys = true;
+ }
+ } else {
+ // search_left_bound > search_right_bound, key does not exist in
+ // this level. Since no comparison is done in this level, it will
+ // need to search all files in the next level.
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ current_level_range_.SkipKey(mget_iter);
+ continue;
+ }
+ }
+ fp_ctx.start_index_in_curr_level = start_index;
+ fp_ctx.curr_index_in_curr_level = start_index;
+ }
+ if (level_contains_keys) {
+ batch_iter_prev_ = current_level_range_.begin();
+ batch_iter_ = current_level_range_.begin();
+ return true;
+ }
+ curr_level_++;
+ }
+ // curr_level_ = num_levels_. So, no more levels to search.
+ return false;
+ }
+};
+} // anonymous namespace
+
+VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
+
+Version::~Version() {
+ assert(refs_ == 0);
+
+ // Remove from linked list
+ prev_->next_ = next_;
+ next_->prev_ = prev_;
+
+ // Drop references to files
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
+ FileMetaData* f = storage_info_.files_[level][i];
+ assert(f->refs > 0);
+ f->refs--;
+ if (f->refs <= 0) {
+ assert(cfd_ != nullptr);
+ uint32_t path_id = f->fd.GetPathId();
+ assert(path_id < cfd_->ioptions()->cf_paths.size());
+ vset_->obsolete_files_.push_back(
+ ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path));
+ }
+ }
+ }
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level,
+ const Slice& key) {
+ return FindFileInRange(icmp, file_level, key, 0,
+ static_cast<uint32_t>(file_level.num_files));
+}
+
+void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+ const std::vector<FileMetaData*>& files,
+ Arena* arena) {
+ assert(file_level);
+ assert(arena);
+
+ size_t num = files.size();
+ file_level->num_files = num;
+ char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
+ file_level->files = new (mem)FdWithKeyRange[num];
+
+ for (size_t i = 0; i < num; i++) {
+ Slice smallest_key = files[i]->smallest.Encode();
+ Slice largest_key = files[i]->largest.Encode();
+
+ // Copy key slice to sequential memory
+ size_t smallest_size = smallest_key.size();
+ size_t largest_size = largest_key.size();
+ mem = arena->AllocateAligned(smallest_size + largest_size);
+ memcpy(mem, smallest_key.data(), smallest_size);
+ memcpy(mem + smallest_size, largest_key.data(), largest_size);
+
+ FdWithKeyRange& f = file_level->files[i];
+ f.fd = files[i]->fd;
+ f.file_metadata = files[i];
+ f.smallest_key = Slice(mem, smallest_size);
+ f.largest_key = Slice(mem + smallest_size, largest_size);
+ }
+}
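
A standalone analogue (using std::string_view in place of Slice and a std::string in place of the Arena; purely for illustration) of the contiguous key packing done above, where one allocation per file holds both boundary keys side by side:

#include <cassert>
#include <cstring>
#include <string>
#include <string_view>

int main() {
  std::string smallest = "apple", largest = "zebra";

  // One backing allocation per file, mirroring the single AllocateAligned()
  // call above; both keys live next to each other in it.
  std::string mem(smallest.size() + largest.size(), '\0');
  std::memcpy(&mem[0], smallest.data(), smallest.size());
  std::memcpy(&mem[smallest.size()], largest.data(), largest.size());

  std::string_view packed_smallest(mem.data(), smallest.size());
  std::string_view packed_largest(mem.data() + smallest.size(), largest.size());
  assert(packed_smallest == "apple");
  assert(packed_largest == "zebra");
  return 0;
}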
+
+static bool AfterFile(const Comparator* ucmp,
+ const Slice* user_key, const FdWithKeyRange* f) {
+ // nullptr user_key occurs before all keys and is therefore never after *f
+ return (user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(*user_key,
+ ExtractUserKey(f->largest_key)) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp,
+ const Slice* user_key, const FdWithKeyRange* f) {
+ // nullptr user_key occurs after all keys and is therefore never before *f
+ return (user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(*user_key,
+ ExtractUserKey(f->smallest_key)) < 0);
+}
+
+bool SomeFileOverlapsRange(
+ const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const LevelFilesBrief& file_level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ const Comparator* ucmp = icmp.user_comparator();
+ if (!disjoint_sorted_files) {
+ // Need to check against all files
+ for (size_t i = 0; i < file_level.num_files; i++) {
+ const FdWithKeyRange* f = &(file_level.files[i]);
+ if (AfterFile(ucmp, smallest_user_key, f) ||
+ BeforeFile(ucmp, largest_user_key, f)) {
+ // No overlap
+ } else {
+ return true; // Overlap
+ }
+ }
+ return false;
+ }
+
+ // Binary search over file list
+ uint32_t index = 0;
+ if (smallest_user_key != nullptr) {
+ // Find the leftmost possible internal key for smallest_user_key
+ InternalKey small;
+ small.SetMinPossibleForUserKey(*smallest_user_key);
+ index = FindFile(icmp, file_level, small.Encode());
+ }
+
+ if (index >= file_level.num_files) {
+ // beginning of range is after all files, so no overlap.
+ return false;
+ }
+
+ return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
+}
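
For intuition only, the same check on plain integer intervals (a simplification: unlike the real function it does not handle open-ended ranges expressed as nullptr keys): binary-search for the first interval whose end is >= the range start, then test whether the query range ends before that interval begins.

#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

// Sorted, disjoint [start, end] intervals standing in for a level's files.
bool SomeIntervalOverlapsRange(const std::vector<std::pair<int, int>>& files,
                               int range_start, int range_end) {
  // First interval whose end >= range_start (analogue of FindFile()).
  auto it = std::lower_bound(
      files.begin(), files.end(), range_start,
      [](const std::pair<int, int>& f, int key) { return f.second < key; });
  if (it == files.end()) {
    return false;  // range begins after all files
  }
  // Overlap unless the whole range ends before this file starts
  // (analogue of BeforeFile()).
  return range_end >= it->first;
}

int main() {
  std::vector<std::pair<int, int>> files = {{0, 9}, {20, 29}, {40, 49}};
  assert(SomeIntervalOverlapsRange(files, 5, 7));     // inside the first file
  assert(!SomeIntervalOverlapsRange(files, 10, 19));  // falls in a gap
  assert(SomeIntervalOverlapsRange(files, 15, 25));   // straddles a gap
  assert(!SomeIntervalOverlapsRange(files, 50, 60));  // after all files
  return 0;
}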
+
+namespace {
+
+class LevelIterator final : public InternalIterator {
+ public:
+ LevelIterator(TableCache* table_cache, const ReadOptions& read_options,
+ const FileOptions& file_options,
+ const InternalKeyComparator& icomparator,
+ const LevelFilesBrief* flevel,
+ const SliceTransform* prefix_extractor, bool should_sample,
+ HistogramImpl* file_read_hist, TableReaderCaller caller,
+ bool skip_filters, int level, RangeDelAggregator* range_del_agg,
+ const std::vector<AtomicCompactionUnitBoundary>*
+ compaction_boundaries = nullptr)
+ : table_cache_(table_cache),
+ read_options_(read_options),
+ file_options_(file_options),
+ icomparator_(icomparator),
+ user_comparator_(icomparator.user_comparator()),
+ flevel_(flevel),
+ prefix_extractor_(prefix_extractor),
+ file_read_hist_(file_read_hist),
+ should_sample_(should_sample),
+ caller_(caller),
+ skip_filters_(skip_filters),
+ file_index_(flevel_->num_files),
+ level_(level),
+ range_del_agg_(range_del_agg),
+ pinned_iters_mgr_(nullptr),
+ compaction_boundaries_(compaction_boundaries) {
+ // Empty level is not supported.
+ assert(flevel_ != nullptr && flevel_->num_files > 0);
+ }
+
+ ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+
+ void Seek(const Slice& target) override;
+ void SeekForPrev(const Slice& target) override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Next() final override;
+ bool NextAndGetResult(IterateResult* result) override;
+ void Prev() override;
+
+ bool Valid() const override { return file_iter_.Valid(); }
+ Slice key() const override {
+ assert(Valid());
+ return file_iter_.key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ return file_iter_.value();
+ }
+
+ Status status() const override {
+ return file_iter_.iter() ? file_iter_.status() : Status::OK();
+ }
+
+ inline bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+ return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
+ }
+
+ inline bool MayBeOutOfUpperBound() override {
+ assert(Valid());
+ return file_iter_.MayBeOutOfUpperBound();
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ if (file_iter_.iter()) {
+ file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
+ }
+ }
+
+ bool IsKeyPinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_.iter() && file_iter_.IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_.iter() && file_iter_.IsValuePinned();
+ }
+
+ private:
+ // Return true if at least one invalid file is seen and skipped.
+ bool SkipEmptyFileForward();
+ void SkipEmptyFileBackward();
+ void SetFileIterator(InternalIterator* iter);
+ void InitFileIterator(size_t new_file_index);
+
+ // Called by both of Next() and NextAndGetResult(). Force inline.
+ void NextImpl() {
+ assert(Valid());
+ file_iter_.Next();
+ SkipEmptyFileForward();
+ }
+
+ const Slice& file_smallest_key(size_t file_index) {
+ assert(file_index < flevel_->num_files);
+ return flevel_->files[file_index].smallest_key;
+ }
+
+ bool KeyReachedUpperBound(const Slice& internal_key) {
+ return read_options_.iterate_upper_bound != nullptr &&
+ user_comparator_.CompareWithoutTimestamp(
+ ExtractUserKey(internal_key),
+ *read_options_.iterate_upper_bound) >= 0;
+ }
+
+ InternalIterator* NewFileIterator() {
+ assert(file_index_ < flevel_->num_files);
+ auto file_meta = flevel_->files[file_index_];
+ if (should_sample_) {
+ sample_file_read_inc(file_meta.file_metadata);
+ }
+
+ const InternalKey* smallest_compaction_key = nullptr;
+ const InternalKey* largest_compaction_key = nullptr;
+ if (compaction_boundaries_ != nullptr) {
+ smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
+ largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
+ }
+ CheckMayBeOutOfLowerBound();
+ return table_cache_->NewIterator(
+ read_options_, file_options_, icomparator_, *file_meta.file_metadata,
+ range_del_agg_, prefix_extractor_,
+ nullptr /* don't need reference to table */, file_read_hist_, caller_,
+ /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key,
+ largest_compaction_key);
+ }
+
+ // Check whether the current file lies fully within iterate_lower_bound.
+ //
+ // Note MyRocks may update the iterate bounds between seeks. To work around
+ // that, we need to re-check and update may_be_out_of_lower_bound_
+ // accordingly.
+ void CheckMayBeOutOfLowerBound() {
+ if (read_options_.iterate_lower_bound != nullptr &&
+ file_index_ < flevel_->num_files) {
+ may_be_out_of_lower_bound_ =
+ user_comparator_.Compare(
+ ExtractUserKey(file_smallest_key(file_index_)),
+ *read_options_.iterate_lower_bound) < 0;
+ }
+ }
+
+ TableCache* table_cache_;
+ const ReadOptions read_options_;
+ const FileOptions& file_options_;
+ const InternalKeyComparator& icomparator_;
+ const UserComparatorWrapper user_comparator_;
+ const LevelFilesBrief* flevel_;
+ mutable FileDescriptor current_value_;
+ // `prefix_extractor_` may be non-null even for total order seek. Checking
+ // this variable is not the right way to identify whether prefix iterator
+ // is used.
+ const SliceTransform* prefix_extractor_;
+
+ HistogramImpl* file_read_hist_;
+ bool should_sample_;
+ TableReaderCaller caller_;
+ bool skip_filters_;
+ bool may_be_out_of_lower_bound_ = true;
+ size_t file_index_;
+ int level_;
+ RangeDelAggregator* range_del_agg_;
+ IteratorWrapper file_iter_; // May be nullptr
+ PinnedIteratorsManager* pinned_iters_mgr_;
+
+ // To be propagated to RangeDelAggregator in order to safely truncate range
+ // tombstones.
+ const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
+};
+
+void LevelIterator::Seek(const Slice& target) {
+ // Check whether the seek key falls within the file we are already
+ // positioned on.
+ bool need_to_reseek = true;
+ if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
+ const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+ if (icomparator_.InternalKeyComparator::Compare(
+ target, cur_file.largest_key) <= 0 &&
+ icomparator_.InternalKeyComparator::Compare(
+ target, cur_file.smallest_key) >= 0) {
+ need_to_reseek = false;
+ assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) ==
+ file_index_);
+ }
+ }
+ if (need_to_reseek) {
+ TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile");
+ size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+ InitFileIterator(new_file_index);
+ }
+
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.Seek(target);
+ }
+ if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
+ !read_options_.total_order_seek && !read_options_.auto_prefix_mode &&
+ file_iter_.iter() != nullptr && file_iter_.Valid()) {
+ // We've skipped the file we initially positioned to. In the prefix
+ // seek case, it is likely that the file is skipped because of
+ // prefix bloom or hash, where more keys are skipped. We then check
+ // the current key and invalidate the iterator if the prefix is
+ // already passed.
+ // When doing a prefix iterator seek, once the keys for one prefix have
+ // been exhausted, the iterator is allowed to jump to any larger key. Here
+ // we enforce a stricter contract than that, to make it easier for the
+ // higher layers (merging and DB iterator) to reason about correctness:
+ // 1. Within the prefix, the result must be accurate.
+ // 2. Once keys for the prefix are exhausted, the iterator is either
+ // positioned at the next key after the prefix or made invalid.
+ // A side benefit is that the iterator is invalidated earlier, so the
+ // upper-level merging iterator can merge fewer child iterators.
+ Slice target_user_key = ExtractUserKey(target);
+ Slice file_user_key = ExtractUserKey(file_iter_.key());
+ if (prefix_extractor_->InDomain(target_user_key) &&
+ (!prefix_extractor_->InDomain(file_user_key) ||
+ user_comparator_.Compare(
+ prefix_extractor_->Transform(target_user_key),
+ prefix_extractor_->Transform(file_user_key)) != 0)) {
+ SetFileIterator(nullptr);
+ }
+ }
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekForPrev(const Slice& target) {
+ size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+ if (new_file_index >= flevel_->num_files) {
+ new_file_index = flevel_->num_files - 1;
+ }
+
+ InitFileIterator(new_file_index);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekForPrev(target);
+ SkipEmptyFileBackward();
+ }
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToFirst() {
+ InitFileIterator(0);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToFirst();
+ }
+ SkipEmptyFileForward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToLast() {
+ InitFileIterator(flevel_->num_files - 1);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToLast();
+ }
+ SkipEmptyFileBackward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::Next() { NextImpl(); }
+
+bool LevelIterator::NextAndGetResult(IterateResult* result) {
+ NextImpl();
+ bool is_valid = Valid();
+ if (is_valid) {
+ result->key = key();
+ result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
+ }
+ return is_valid;
+}
+
+void LevelIterator::Prev() {
+ assert(Valid());
+ file_iter_.Prev();
+ SkipEmptyFileBackward();
+}
+
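+ // Advance to the next file while the current file iterator is exhausted
+ // (and was not stopped by an upper-bound check), until a non-empty file is
+ // found, the upper bound is reached, or the level is exhausted.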
+bool LevelIterator::SkipEmptyFileForward() {
+ bool seen_empty_file = false;
+ while (file_iter_.iter() == nullptr ||
+ (!file_iter_.Valid() && file_iter_.status().ok() &&
+ !file_iter_.iter()->IsOutOfBound())) {
+ seen_empty_file = true;
+ // Move to next file
+ if (file_index_ >= flevel_->num_files - 1) {
+ // Already at the last file
+ SetFileIterator(nullptr);
+ break;
+ }
+ if (KeyReachedUpperBound(file_smallest_key(file_index_ + 1))) {
+ SetFileIterator(nullptr);
+ break;
+ }
+ InitFileIterator(file_index_ + 1);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToFirst();
+ }
+ }
+ return seen_empty_file;
+}
+
+void LevelIterator::SkipEmptyFileBackward() {
+ while (file_iter_.iter() == nullptr ||
+ (!file_iter_.Valid() && file_iter_.status().ok())) {
+ // Move to previous file
+ if (file_index_ == 0) {
+ // Already the first file
+ SetFileIterator(nullptr);
+ return;
+ }
+ InitFileIterator(file_index_ - 1);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToLast();
+ }
+ }
+}
+
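+ // Swap in a new table iterator. The previous iterator is handed to the
+ // pinned-iterators manager when pinning is enabled (so pinned keys and
+ // values remain valid); otherwise it is deleted immediately.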
+void LevelIterator::SetFileIterator(InternalIterator* iter) {
+ if (pinned_iters_mgr_ && iter) {
+ iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ InternalIterator* old_iter = file_iter_.Set(iter);
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(old_iter);
+ } else {
+ delete old_iter;
+ }
+}
+
+void LevelIterator::InitFileIterator(size_t new_file_index) {
+ if (new_file_index >= flevel_->num_files) {
+ file_index_ = new_file_index;
+ SetFileIterator(nullptr);
+ return;
+ } else {
+ // If the file iterator returned an Incomplete status, retry when users
+ // seek to the same file again: this time we may hit a different data
+ // block that is already cached in the block cache.
+ if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() &&
+ new_file_index == file_index_) {
+ // file_iter_ is already constructed with this iterator, so
+ // no need to change anything
+ } else {
+ file_index_ = new_file_index;
+ InternalIterator* iter = NewFileIterator();
+ SetFileIterator(iter);
+ }
+ }
+}
+} // anonymous namespace
+
+ // A wrapper around VersionBuilder that references the current version in
+ // its constructor and unreferences it in its destructor.
+ // Both the constructor and the destructor must be called while holding the
+ // DB mutex.
+class BaseReferencedVersionBuilder {
+ public:
+ explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd)
+ : version_builder_(new VersionBuilder(
+ cfd->current()->version_set()->file_options(), cfd->table_cache(),
+ cfd->current()->storage_info(), cfd->ioptions()->info_log)),
+ version_(cfd->current()) {
+ version_->Ref();
+ }
+ ~BaseReferencedVersionBuilder() {
+ version_->Unref();
+ }
+ VersionBuilder* version_builder() { return version_builder_.get(); }
+
+ private:
+ std::unique_ptr<VersionBuilder> version_builder_;
+ Version* version_;
+};
+
+Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+ const FileMetaData* file_meta,
+ const std::string* fname) const {
+ auto table_cache = cfd_->table_cache();
+ auto ioptions = cfd_->ioptions();
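+ // 1. Try to load the table properties from the table cache without
+ // performing any I/O.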
+ Status s = table_cache->GetTableProperties(
+ file_options_, cfd_->internal_comparator(), file_meta->fd, tp,
+ mutable_cf_options_.prefix_extractor.get(), true /* no io */);
+ if (s.ok()) {
+ return s;
+ }
+
+ // We only ignore the `Incomplete` error type since it is by design that we
+ // disallow reading the table when it is not in the table cache.
+ if (!s.IsIncomplete()) {
+ return s;
+ }
+
+ // 2. The table is not present in the table cache; read the table properties
+ // directly from the properties block in the file.
+ std::unique_ptr<FSRandomAccessFile> file;
+ std::string file_name;
+ if (fname != nullptr) {
+ file_name = *fname;
+ } else {
+ file_name =
+ TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ }
+ s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file,
+ nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TableProperties* raw_table_properties;
+ // By setting the magic number to kInvalidTableMagicNumber, we can bypass
+ // the magic number check in the footer.
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), file_name, nullptr /* env */, nullptr /* stats */,
+ 0 /* hist_type */, nullptr /* file_read_hist */,
+ nullptr /* rate_limiter */, ioptions->listeners));
+ s = ReadTableProperties(
+ file_reader.get(), file_meta->fd.GetFileSize(),
+ Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions,
+ &raw_table_properties, false /* compression_type_missing */);
+ if (!s.ok()) {
+ return s;
+ }
+ RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+
+ *tp = std::shared_ptr<const TableProperties>(raw_table_properties);
+ return s;
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+ Status s;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ s = GetPropertiesOfAllTables(props, level);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
+ std::string* out_str) {
+ if (max_entries_to_print <= 0) {
+ return Status::OK();
+ }
+ int num_entries_left = max_entries_to_print;
+
+ std::stringstream ss;
+
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (const auto& file_meta : storage_info_.files_[level]) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+
+ ss << "=== file : " << fname << " ===\n";
+
+ TableCache* table_cache = cfd_->table_cache();
+ std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
+
+ Status s = table_cache->GetRangeTombstoneIterator(
+ ReadOptions(), cfd_->internal_comparator(), *file_meta,
+ &tombstone_iter);
+ if (!s.ok()) {
+ return s;
+ }
+ if (tombstone_iter) {
+ tombstone_iter->SeekToFirst();
+
+ while (tombstone_iter->Valid() && num_entries_left > 0) {
+ ss << "start: " << tombstone_iter->start_key().ToString(true)
+ << " end: " << tombstone_iter->end_key().ToString(true)
+ << " seq: " << tombstone_iter->seq() << '\n';
+ tombstone_iter->Next();
+ num_entries_left--;
+ }
+ if (num_entries_left <= 0) {
+ break;
+ }
+ }
+ }
+ if (num_entries_left <= 0) {
+ break;
+ }
+ }
+ assert(num_entries_left >= 0);
+ if (num_entries_left <= 0) {
+ ss << "(results may not be complete)\n";
+ }
+
+ *out_str = ss.str();
+ return Status::OK();
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
+ int level) {
+ for (const auto& file_meta : storage_info_.files_[level]) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ // 1. If the table is already present in table cache, load table
+ // properties from there.
+ std::shared_ptr<const TableProperties> table_properties;
+ Status s = GetTableProperties(&table_properties, file_meta, &fname);
+ if (s.ok()) {
+ props->insert({fname, table_properties});
+ } else {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::GetPropertiesOfTablesInRange(
+ const Range* range, std::size_t n, TablePropertiesCollection* props) const {
+ for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+ for (decltype(n) i = 0; i < n; i++) {
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+ std::vector<FileMetaData*> files;
+ storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
+ false);
+ for (const auto& file_meta : files) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths,
+ file_meta->fd.GetNumber(), file_meta->fd.GetPathId());
+ if (props->count(fname) == 0) {
+ // 1. If the table is already present in table cache, load table
+ // properties from there.
+ std::shared_ptr<const TableProperties> table_properties;
+ Status s = GetTableProperties(&table_properties, file_meta, &fname);
+ if (s.ok()) {
+ props->insert({fname, table_properties});
+ } else {
+ return s;
+ }
+ }
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
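+ // Aggregates the table properties of all files at `level` (or of all
+ // levels when `level` is negative) into a single TableProperties object.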
+Status Version::GetAggregatedTableProperties(
+ std::shared_ptr<const TableProperties>* tp, int level) {
+ TablePropertiesCollection props;
+ Status s;
+ if (level < 0) {
+ s = GetPropertiesOfAllTables(&props);
+ } else {
+ s = GetPropertiesOfAllTables(&props, level);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ auto* new_tp = new TableProperties();
+ for (const auto& item : props) {
+ new_tp->Add(*item.second);
+ }
+ tp->reset(new_tp);
+ return Status::OK();
+}
+
+size_t Version::GetMemoryUsageByTableReaders() {
+ size_t total_usage = 0;
+ for (auto& file_level : storage_info_.level_files_brief_) {
+ for (size_t i = 0; i < file_level.num_files; i++) {
+ total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+ file_options_, cfd_->internal_comparator(), file_level.files[i].fd,
+ mutable_cf_options_.prefix_extractor.get());
+ }
+ }
+ return total_usage;
+}
+
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+ assert(cf_meta);
+ assert(cfd_);
+
+ cf_meta->name = cfd_->GetName();
+ cf_meta->size = 0;
+ cf_meta->file_count = 0;
+ cf_meta->levels.clear();
+
+ auto* ioptions = cfd_->ioptions();
+ auto* vstorage = storage_info();
+
+ for (int level = 0; level < cfd_->NumberLevels(); level++) {
+ uint64_t level_size = 0;
+ cf_meta->file_count += vstorage->LevelFiles(level).size();
+ std::vector<SstFileMetaData> files;
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ uint32_t path_id = file->fd.GetPathId();
+ std::string file_path;
+ if (path_id < ioptions->cf_paths.size()) {
+ file_path = ioptions->cf_paths[path_id].path;
+ } else {
+ assert(!ioptions->cf_paths.empty());
+ file_path = ioptions->cf_paths.back().path;
+ }
+ const uint64_t file_number = file->fd.GetNumber();
+ files.emplace_back(SstFileMetaData{
+ MakeTableFileName("", file_number), file_number, file_path,
+ static_cast<size_t>(file->fd.GetFileSize()), file->fd.smallest_seqno,
+ file->fd.largest_seqno, file->smallest.user_key().ToString(),
+ file->largest.user_key().ToString(),
+ file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+ file->being_compacted, file->oldest_blob_file_number,
+ file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(),
+ file->file_checksum, file->file_checksum_func_name});
+ files.back().num_entries = file->num_entries;
+ files.back().num_deletions = file->num_deletions;
+ level_size += file->fd.GetFileSize();
+ }
+ cf_meta->levels.emplace_back(
+ level, level_size, std::move(files));
+ cf_meta->size += level_size;
+ }
+}
+
+uint64_t Version::GetSstFilesSize() {
+ uint64_t sst_files_size = 0;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (const auto& file_meta : storage_info_.LevelFiles(level)) {
+ sst_files_size += file_meta->fd.GetFileSize();
+ }
+ }
+ return sst_files_size;
+}
+
+void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ uint64_t oldest_time = port::kMaxUint64;
+ for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
+ for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
+ assert(meta->fd.table_reader != nullptr);
+ uint64_t file_creation_time = meta->TryGetFileCreationTime();
+ if (file_creation_time == kUnknownFileCreationTime) {
+ *creation_time = 0;
+ return;
+ }
+ if (file_creation_time < oldest_time) {
+ oldest_time = file_creation_time;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+}
+
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+ // Estimation will be inaccurate when:
+ // (1) there exist merge keys
+ // (2) keys are directly overwritten
+ // (3) deletions are issued for non-existing keys
+ // (4) the number of samples is low
+ if (current_num_samples_ == 0) {
+ return 0;
+ }
+
+ if (current_num_non_deletions_ <= current_num_deletions_) {
+ return 0;
+ }
+
+ uint64_t est = current_num_non_deletions_ - current_num_deletions_;
+
+ uint64_t file_count = 0;
+ for (int level = 0; level < num_levels_; ++level) {
+ file_count += files_[level].size();
+ }
+
+ if (current_num_samples_ < file_count) {
+ // casting to avoid overflowing
+ return
+ static_cast<uint64_t>(
+ (est * static_cast<double>(file_count) / current_num_samples_)
+ );
+ } else {
+ return est;
+ }
+}
+
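+ // Estimates the level's compression ratio as raw (uncompressed) key+value
+ // bytes divided by on-disk file bytes; returns -1.0 when the level holds
+ // no data.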
+double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
+ int level) const {
+ assert(level < num_levels_);
+ uint64_t sum_file_size_bytes = 0;
+ uint64_t sum_data_size_bytes = 0;
+ for (auto* file_meta : files_[level]) {
+ sum_file_size_bytes += file_meta->fd.GetFileSize();
+ sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size;
+ }
+ if (sum_file_size_bytes == 0) {
+ return -1.0;
+ }
+ return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
+}
+
+void Version::AddIterators(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merge_iter_builder,
+ RangeDelAggregator* range_del_agg) {
+ assert(storage_info_.finalized_);
+
+ for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+ AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
+ range_del_agg);
+ }
+}
+
+void Version::AddIteratorsForLevel(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merge_iter_builder,
+ int level,
+ RangeDelAggregator* range_del_agg) {
+ assert(storage_info_.finalized_);
+ if (level >= storage_info_.num_non_empty_levels()) {
+ // This is an empty level
+ return;
+ } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
+ // No files in this level
+ return;
+ }
+
+ bool should_sample = should_sample_file_read();
+
+ auto* arena = merge_iter_builder->GetArena();
+ if (level == 0) {
+ // Merge all level zero files together since they may overlap
+ for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+ const auto& file = storage_info_.LevelFilesBrief(0).files[i];
+ merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
+ read_options, soptions, cfd_->internal_comparator(),
+ *file.file_metadata, range_del_agg,
+ mutable_cf_options_.prefix_extractor.get(), nullptr,
+ cfd_->internal_stats()->GetFileReadHist(0),
+ TableReaderCaller::kUserIterator, arena,
+ /*skip_filters=*/false, /*level=*/0,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr));
+ }
+ if (should_sample) {
+ // Count one read for every L0 file. This is done per iterator creation
+ // rather than per Seek(), while files in other levels are recorded per
+ // seek.
+ // If users execute one range query per iterator, there may be some
+ // discrepancy here.
+ for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+ sample_file_read_inc(meta);
+ }
+ }
+ } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+ // For levels > 0, we can use a concatenating iterator that sequentially
+ // walks through the non-overlapping files in the level, opening them
+ // lazily.
+ auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+ merge_iter_builder->AddIterator(new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, soptions,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ range_del_agg, /*largest_compaction_key=*/nullptr));
+ }
+}
+
+Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
+ const FileOptions& file_options,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level, bool* overlap) {
+ assert(storage_info_.finalized_);
+
+ auto icmp = cfd_->internal_comparator();
+ auto ucmp = icmp.user_comparator();
+
+ Arena arena;
+ Status status;
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ *overlap = false;
+
+ if (level == 0) {
+ for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+ const auto file = &storage_info_.LevelFilesBrief(0).files[i];
+ if (AfterFile(ucmp, &smallest_user_key, file) ||
+ BeforeFile(ucmp, &largest_user_key, file)) {
+ continue;
+ }
+ ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
+ read_options, file_options, cfd_->internal_comparator(),
+ *file->file_metadata, &range_del_agg,
+ mutable_cf_options_.prefix_extractor.get(), nullptr,
+ cfd_->internal_stats()->GetFileReadHist(0),
+ TableReaderCaller::kUserIterator, &arena,
+ /*skip_filters=*/false, /*level=*/0,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr));
+ status = OverlapWithIterator(
+ ucmp, smallest_user_key, largest_user_key, iter.get(), overlap);
+ if (!status.ok() || *overlap) {
+ break;
+ }
+ }
+ } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+ auto mem = arena.AllocateAligned(sizeof(LevelIterator));
+ ScopedArenaIterator iter(new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, file_options,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ &range_del_agg));
+ status = OverlapWithIterator(
+ ucmp, smallest_user_key, largest_user_key, iter.get(), overlap);
+ }
+
+ if (status.ok() && *overlap == false &&
+ range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
+ *overlap = true;
+ }
+ return status;
+}
+
+VersionStorageInfo::VersionStorageInfo(
+ const InternalKeyComparator* internal_comparator,
+ const Comparator* user_comparator, int levels,
+ CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
+ bool _force_consistency_checks)
+ : internal_comparator_(internal_comparator),
+ user_comparator_(user_comparator),
+ // cfd is nullptr if Version is dummy
+ num_levels_(levels),
+ num_non_empty_levels_(0),
+ file_indexer_(user_comparator),
+ compaction_style_(compaction_style),
+ files_(new std::vector<FileMetaData*>[num_levels_]),
+ base_level_(num_levels_ == 1 ? -1 : 1),
+ level_multiplier_(0.0),
+ files_by_compaction_pri_(num_levels_),
+ level0_non_overlapping_(false),
+ next_file_to_compact_by_size_(num_levels_),
+ compaction_score_(num_levels_),
+ compaction_level_(num_levels_),
+ l0_delay_trigger_count_(0),
+ accumulated_file_size_(0),
+ accumulated_raw_key_size_(0),
+ accumulated_raw_value_size_(0),
+ accumulated_num_non_deletions_(0),
+ accumulated_num_deletions_(0),
+ current_num_non_deletions_(0),
+ current_num_deletions_(0),
+ current_num_samples_(0),
+ estimated_compaction_needed_bytes_(0),
+ finalized_(false),
+ force_consistency_checks_(_force_consistency_checks) {
+ if (ref_vstorage != nullptr) {
+ accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
+ accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
+ accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
+ accumulated_num_non_deletions_ =
+ ref_vstorage->accumulated_num_non_deletions_;
+ accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
+ current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_;
+ current_num_deletions_ = ref_vstorage->current_num_deletions_;
+ current_num_samples_ = ref_vstorage->current_num_samples_;
+ oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_;
+ }
+}
+
+Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
+ const FileOptions& file_opt,
+ const MutableCFOptions mutable_cf_options,
+ uint64_t version_number)
+ : env_(vset->env_),
+ cfd_(column_family_data),
+ info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log),
+ db_statistics_((cfd_ == nullptr) ? nullptr
+ : cfd_->ioptions()->statistics),
+ table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
+ merge_operator_((cfd_ == nullptr) ? nullptr
+ : cfd_->ioptions()->merge_operator),
+ storage_info_(
+ (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
+ (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
+ cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
+ cfd_ == nullptr ? kCompactionStyleLevel
+ : cfd_->ioptions()->compaction_style,
+ (cfd_ == nullptr || cfd_->current() == nullptr)
+ ? nullptr
+ : cfd_->current()->storage_info(),
+ cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
+ vset_(vset),
+ next_(this),
+ prev_(this),
+ refs_(0),
+ file_options_(file_opt),
+ mutable_cf_options_(mutable_cf_options),
+ version_number_(version_number) {}
+
+void Version::Get(const ReadOptions& read_options, const LookupKey& k,
+ PinnableSlice* value, Status* status,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, bool* value_found,
+ bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
+ bool* is_blob, bool do_merge) {
+ Slice ikey = k.internal_key();
+ Slice user_key = k.user_key();
+
+ assert(status->ok() || status->IsMergeInProgress());
+
+ if (key_exists != nullptr) {
+ // will falsify below if not found
+ *key_exists = true;
+ }
+
+ PinnedIteratorsManager pinned_iters_mgr;
+ uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
+ if (vset_ && vset_->block_cache_tracer_ &&
+ vset_->block_cache_tracer_->is_tracing_enabled()) {
+ tracing_get_id = vset_->block_cache_tracer_->NextGetId();
+ }
+ GetContext get_context(
+ user_comparator(), merge_operator_, info_log_, db_statistics_,
+ status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
+ do_merge ? value : nullptr, value_found, merge_context, do_merge,
+ max_covering_tombstone_seq, this->env_, seq,
+ merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob,
+ tracing_get_id);
+
+ // Pin blocks that we read to hold merge operands
+ if (merge_operator_) {
+ pinned_iters_mgr.StartPinning();
+ }
+
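+ // FilePicker hands back candidate files newest-first: every overlapping
+ // L0 file in order, then at most one file per deeper level, located via
+ // the file indexer.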
+ FilePicker fp(
+ storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_,
+ storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_,
+ user_comparator(), internal_comparator());
+ FdWithKeyRange* f = fp.GetNextFile();
+
+ while (f != nullptr) {
+ if (*max_covering_tombstone_seq > 0) {
+ // The remaining files we look at will only contain covered keys, so we
+ // stop here.
+ break;
+ }
+ if (get_context.sample()) {
+ sample_file_read_inc(f->file_metadata);
+ }
+
+ bool timer_enabled =
+ GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+ get_perf_context()->per_level_perf_context_enabled;
+ StopWatchNano timer(env_, timer_enabled /* auto_start */);
+ *status = table_cache_->Get(
+ read_options, *internal_comparator(), *f->file_metadata, ikey,
+ &get_context, mutable_cf_options_.prefix_extractor.get(),
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel()),
+ fp.GetCurrentLevel());
+ // TODO: examine the behavior for corrupted key
+ if (timer_enabled) {
+ PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+ fp.GetCurrentLevel());
+ }
+ if (!status->ok()) {
+ return;
+ }
+
+ // report the counters before returning
+ if (get_context.State() != GetContext::kNotFound &&
+ get_context.State() != GetContext::kMerge &&
+ db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ switch (get_context.State()) {
+ case GetContext::kNotFound:
+ // Keep searching in other files
+ break;
+ case GetContext::kMerge:
+ // TODO: update per-level perfcontext user_key_return_count for kMerge
+ break;
+ case GetContext::kFound:
+ if (fp.GetHitFileLevel() == 0) {
+ RecordTick(db_statistics_, GET_HIT_L0);
+ } else if (fp.GetHitFileLevel() == 1) {
+ RecordTick(db_statistics_, GET_HIT_L1);
+ } else if (fp.GetHitFileLevel() >= 2) {
+ RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+ }
+ PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+ fp.GetHitFileLevel());
+ return;
+ case GetContext::kDeleted:
+ // Use empty error message for speed
+ *status = Status::NotFound();
+ return;
+ case GetContext::kCorrupt:
+ *status = Status::Corruption("corrupted key for ", user_key);
+ return;
+ case GetContext::kBlobIndex:
+ ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+ *status = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ return;
+ }
+ f = fp.GetNextFile();
+ }
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ if (GetContext::kMerge == get_context.State()) {
+ if (!do_merge) {
+ *status = Status::OK();
+ return;
+ }
+ if (!merge_operator_) {
+ *status = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+ // The merge operands are in the saver and we hit the beginning of the key
+ // history; do a final merge of nullptr and the operands.
+ std::string* str_value = value != nullptr ? value->GetSelf() : nullptr;
+ *status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, nullptr, merge_context->GetOperands(),
+ str_value, info_log_, db_statistics_, env_,
+ nullptr /* result_operand */, true);
+ if (LIKELY(value != nullptr)) {
+ value->PinSelf();
+ }
+ } else {
+ if (key_exists != nullptr) {
+ *key_exists = false;
+ }
+ *status = Status::NotFound(); // Use an empty error message for speed
+ }
+}
+
+void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool* is_blob) {
+ PinnedIteratorsManager pinned_iters_mgr;
+
+ // Pin blocks that we read to hold merge operands
+ if (merge_operator_) {
+ pinned_iters_mgr.StartPinning();
+ }
+ uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+
+ if (vset_ && vset_->block_cache_tracer_ &&
+ vset_->block_cache_tracer_->is_tracing_enabled()) {
+ tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
+ }
+ // Even though we know the batch size won't be > MAX_BATCH_SIZE,
+ // use autovector in order to avoid unnecessary construction of GetContext
+ // objects, which is expensive
+ autovector<GetContext, 16> get_ctx;
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ assert(iter->s->ok() || iter->s->IsMergeInProgress());
+ get_ctx.emplace_back(
+ user_comparator(), merge_operator_, info_log_, db_statistics_,
+ iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey,
+ iter->value, nullptr, &(iter->merge_context), true,
+ &iter->max_covering_tombstone_seq, this->env_, nullptr,
+ merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob,
+ tracing_mget_id);
+ // MergeInProgress status, if set, has been transferred to the get_context
+ // state, so we set status to ok here. From now on, the iter status will
+ // be used for IO errors, and get_context state will be used for any
+ // key level errors
+ *(iter->s) = Status::OK();
+ }
+ int get_ctx_index = 0;
+ for (auto iter = range->begin(); iter != range->end();
+ ++iter, get_ctx_index++) {
+ iter->get_context = &(get_ctx[get_ctx_index]);
+ }
+
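+ // FilePickerMultiGet walks the LSM level by level and batches together the
+ // keys whose ranges fall within the same file, so one table lookup can
+ // serve several keys at once.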
+ MultiGetRange file_picker_range(*range, range->begin(), range->end());
+ FilePickerMultiGet fp(
+ &file_picker_range,
+ &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_,
+ &storage_info_.file_indexer_, user_comparator(), internal_comparator());
+ FdWithKeyRange* f = fp.GetNextFile();
+
+ while (f != nullptr) {
+ MultiGetRange file_range = fp.CurrentFileRange();
+ bool timer_enabled =
+ GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+ get_perf_context()->per_level_perf_context_enabled;
+ StopWatchNano timer(env_, timer_enabled /* auto_start */);
+ Status s = table_cache_->MultiGet(
+ read_options, *internal_comparator(), *f->file_metadata, &file_range,
+ mutable_cf_options_.prefix_extractor.get(),
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel()),
+ fp.GetCurrentLevel());
+ // TODO: examine the behavior for corrupted key
+ if (timer_enabled) {
+ PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+ fp.GetCurrentLevel());
+ }
+ if (!s.ok()) {
+ // TODO: Set status for individual keys appropriately
+ for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+ *iter->s = s;
+ file_range.MarkKeyDone(iter);
+ }
+ return;
+ }
+ uint64_t batch_size = 0;
+ for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+ GetContext& get_context = *iter->get_context;
+ Status* status = iter->s;
+ // The Status in the KeyContext takes precedence over the GetContext
+ // state. Status may be an error if there were any IO errors in the
+ // table reader. We never expect Status to be NotFound(), as that is
+ // determined by get_context.
+ assert(!status->IsNotFound());
+ if (!status->ok()) {
+ file_range.MarkKeyDone(iter);
+ continue;
+ }
+
+ if (get_context.sample()) {
+ sample_file_read_inc(f->file_metadata);
+ }
+ batch_size++;
+ // report the counters before returning
+ if (get_context.State() != GetContext::kNotFound &&
+ get_context.State() != GetContext::kMerge &&
+ db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ } else {
+ if (iter->max_covering_tombstone_seq > 0) {
+ // The remaining files we look at will only contain covered keys, so
+ // we stop here for this key
+ file_picker_range.SkipKey(iter);
+ }
+ }
+ switch (get_context.State()) {
+ case GetContext::kNotFound:
+ // Keep searching in other files
+ break;
+ case GetContext::kMerge:
+ // TODO: update per-level perfcontext user_key_return_count for kMerge
+ break;
+ case GetContext::kFound:
+ if (fp.GetHitFileLevel() == 0) {
+ RecordTick(db_statistics_, GET_HIT_L0);
+ } else if (fp.GetHitFileLevel() == 1) {
+ RecordTick(db_statistics_, GET_HIT_L1);
+ } else if (fp.GetHitFileLevel() >= 2) {
+ RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+ }
+ PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+ fp.GetHitFileLevel());
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kDeleted:
+ // Use empty error message for speed
+ *status = Status::NotFound();
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kCorrupt:
+ *status =
+ Status::Corruption("corrupted key for ", iter->lkey->user_key());
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kBlobIndex:
+ ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+ *status = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ file_range.MarkKeyDone(iter);
+ continue;
+ }
+ }
+ RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
+ if (file_picker_range.empty()) {
+ break;
+ }
+ f = fp.GetNextFile();
+ }
+
+ // Process any left over keys
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ GetContext& get_context = *iter->get_context;
+ Status* status = iter->s;
+ Slice user_key = iter->lkey->user_key();
+
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ if (GetContext::kMerge == get_context.State()) {
+ if (!merge_operator_) {
+ *status = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ range->MarkKeyDone(iter);
+ continue;
+ }
+ // The merge operands are in the saver and we hit the beginning of the key
+ // history; do a final merge of nullptr and the operands.
+ std::string* str_value =
+ iter->value != nullptr ? iter->value->GetSelf() : nullptr;
+ *status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(),
+ str_value, info_log_, db_statistics_, env_,
+ nullptr /* result_operand */, true);
+ if (LIKELY(iter->value != nullptr)) {
+ iter->value->PinSelf();
+ }
+ } else {
+ range->MarkKeyDone(iter);
+ *status = Status::NotFound(); // Use an empty error message for speed
+ }
+ }
+}
+
+bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
+ // Reaching the bottom level implies misses at all upper levels, so we'll
+ // skip checking the filters when we predict a hit.
+ return cfd_->ioptions()->optimize_filters_for_hits &&
+ (level > 0 || is_file_last_in_level) &&
+ level == storage_info_.num_non_empty_levels() - 1;
+}
+
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+ level_files_brief_.resize(num_non_empty_levels_);
+ for (int level = 0; level < num_non_empty_levels_; level++) {
+ DoGenerateLevelFilesBrief(
+ &level_files_brief_[level], files_[level], &arena_);
+ }
+}
+
+void Version::PrepareApply(
+ const MutableCFOptions& mutable_cf_options,
+ bool update_stats) {
+ UpdateAccumulatedStats(update_stats);
+ storage_info_.UpdateNumNonEmptyLevels();
+ storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
+ storage_info_.UpdateFilesByCompactionPri(cfd_->ioptions()->compaction_pri);
+ storage_info_.GenerateFileIndexer();
+ storage_info_.GenerateLevelFilesBrief();
+ storage_info_.GenerateLevel0NonOverlapping();
+ storage_info_.GenerateBottommostFiles();
+}
+
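+ // Loads a file's statistics (entry/deletion counts, raw key/value sizes)
+ // from its table properties the first time they are needed. Returns true
+ // only when the stats were actually loaded here.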
+bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
+ if (file_meta->init_stats_from_file ||
+ file_meta->compensated_file_size > 0) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ Status s = GetTableProperties(&tp, file_meta);
+ file_meta->init_stats_from_file = true;
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(vset_->db_options_->info_log,
+ "Unable to load table properties for file %" PRIu64
+ " --- %s\n",
+ file_meta->fd.GetNumber(), s.ToString().c_str());
+ return false;
+ }
+ if (tp.get() == nullptr) return false;
+ file_meta->num_entries = tp->num_entries;
+ file_meta->num_deletions = tp->num_deletions;
+ file_meta->raw_value_size = tp->raw_value_size;
+ file_meta->raw_key_size = tp->raw_key_size;
+
+ return true;
+}
+
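+ // Folds a newly loaded file's statistics into both the accumulated
+ // (lifetime) counters and the counters for the current version.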
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+ TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
+ nullptr);
+
+ assert(file_meta->init_stats_from_file);
+ accumulated_file_size_ += file_meta->fd.GetFileSize();
+ accumulated_raw_key_size_ += file_meta->raw_key_size;
+ accumulated_raw_value_size_ += file_meta->raw_value_size;
+ accumulated_num_non_deletions_ +=
+ file_meta->num_entries - file_meta->num_deletions;
+ accumulated_num_deletions_ += file_meta->num_deletions;
+
+ current_num_non_deletions_ +=
+ file_meta->num_entries - file_meta->num_deletions;
+ current_num_deletions_ += file_meta->num_deletions;
+ current_num_samples_++;
+}
+
+void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
+ if (file_meta->init_stats_from_file) {
+ current_num_non_deletions_ -=
+ file_meta->num_entries - file_meta->num_deletions;
+ current_num_deletions_ -= file_meta->num_deletions;
+ current_num_samples_--;
+ }
+}
+
+void Version::UpdateAccumulatedStats(bool update_stats) {
+ if (update_stats) {
+ // maximum number of table properties loaded from files.
+ const int kMaxInitCount = 20;
+ int init_count = 0;
+ // Only the first kMaxInitCount files that haven't yet been initialized
+ // from their table properties will have num_deletions updated here.
+ // The motivation is to cap the maximum I/O per Version creation.
+ // The reason for choosing files from lower levels instead of higher levels
+ // is that this design propagates the initialization from lower levels to
+ // higher levels: once the num_deletions of lower-level files are updated,
+ // those files have accurate compensated_file_size, so lower-level to
+ // higher-level compactions get triggered, which creates higher-level files
+ // whose num_deletions will in turn be updated here.
+ for (int level = 0;
+ level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+ ++level) {
+ for (auto* file_meta : storage_info_.files_[level]) {
+ if (MaybeInitializeFileMetaData(file_meta)) {
+ // each FileMeta will be initialized only once.
+ storage_info_.UpdateAccumulatedStats(file_meta);
+ // When the "max_open_files" option is -1, all the file metadata has
+ // already been read, so MaybeInitializeFileMetaData() won't incur
+ // any I/O cost. "max_open_files=-1" means that the table cache passed
+ // to the VersionSet and then to the ColumnFamilySet has a size of
+ // TableCache::kInfiniteCapacity.
+ if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
+ TableCache::kInfiniteCapacity) {
+ continue;
+ }
+ if (++init_count >= kMaxInitCount) {
+ break;
+ }
+ }
+ }
+ }
+ // In case all sampled files contain only deletion entries, load the
+ // table properties of a file from a higher level to initialize
+ // accumulated_raw_value_size_.
+ for (int level = storage_info_.num_levels_ - 1;
+ storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
+ --level) {
+ for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+ storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+ if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+ storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+ }
+ }
+ }
+ }
+
+ storage_info_.ComputeCompensatedSizes();
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+ static const int kDeletionWeightOnCompaction = 2;
+ uint64_t average_value_size = GetAverageValueSize();
+
+ // compute the compensated size
+ for (int level = 0; level < num_levels_; level++) {
+ for (auto* file_meta : files_[level]) {
+ // Here we only compute compensated_file_size for those file_meta
+ // whose compensated_file_size is uninitialized (== 0). This is true only
+ // for files that have just been created and that no other thread has
+ // access to yet. That's why we can safely mutate compensated_file_size.
+ if (file_meta->compensated_file_size == 0) {
+ file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+ // Here we boost the size of deletion entries of a file only
+ // when the number of deletion entries is greater than the number of
+ // non-deletion entries in the file. The motivation is that in
+ // a stable workload the number of deletion entries should be roughly
+ // equal to the number of non-deletion entries. If we compensated the
+ // size of deletion entries in a stable workload, the deletion
+ // compensation logic might introduce unwanted effects that change the
+ // shape of the LSM tree.
+ if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+ file_meta->compensated_file_size +=
+ (file_meta->num_deletions * 2 - file_meta->num_entries) *
+ average_value_size * kDeletionWeightOnCompaction;
+ }
+ }
+ }
+ }
+}
+
+int VersionStorageInfo::MaxInputLevel() const {
+ if (compaction_style_ == kCompactionStyleLevel) {
+ return num_levels() - 2;
+ }
+ return 0;
+}
+
+int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
+ if (allow_ingest_behind) {
+ assert(num_levels() > 1);
+ return num_levels() - 2;
+ }
+ return num_levels() - 1;
+}
+
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+ const MutableCFOptions& mutable_cf_options) {
+ // Only implemented for level-based compaction
+ if (compaction_style_ != kCompactionStyleLevel) {
+ estimated_compaction_needed_bytes_ = 0;
+ return;
+ }
+
+ // Start from level 0: if level 0 qualifies for compaction to level 1,
+ // we estimate the size of that compaction.
+ // Then we move on to the next level and check whether it qualifies for
+ // compaction to the level below. The size of a level is estimated as its
+ // actual size plus the input bytes carried over from the previous level,
+ // if any. If that exceeds the level's target, the excess bytes are taken
+ // as compaction input and the compaction size is added to the total.
+ // We keep doing this for levels 2, 3, etc., until the last level, and
+ // accumulate the resulting bytes.
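+ // For example (hypothetical sizes): if a level holds 150 MB against a
+ // 100 MB target and the next level holds 300 MB, the 50 MB overflow is
+ // counted scaled by the size-ratio fan-out plus one, i.e.
+ // 50 MB * (300/150 + 1) = 150 MB of estimated compaction work.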
+
+ uint64_t bytes_compact_to_next_level = 0;
+ uint64_t level_size = 0;
+ for (auto* f : files_[0]) {
+ level_size += f->fd.GetFileSize();
+ }
+ // Level 0
+ bool level0_compact_triggered = false;
+ if (static_cast<int>(files_[0].size()) >=
+ mutable_cf_options.level0_file_num_compaction_trigger ||
+ level_size >= mutable_cf_options.max_bytes_for_level_base) {
+ level0_compact_triggered = true;
+ estimated_compaction_needed_bytes_ = level_size;
+ bytes_compact_to_next_level = level_size;
+ } else {
+ estimated_compaction_needed_bytes_ = 0;
+ }
+
+ // Level 1 and up.
+ uint64_t bytes_next_level = 0;
+ for (int level = base_level(); level <= MaxInputLevel(); level++) {
+ level_size = 0;
+ if (bytes_next_level > 0) {
+#ifndef NDEBUG
+ uint64_t level_size2 = 0;
+ for (auto* f : files_[level]) {
+ level_size2 += f->fd.GetFileSize();
+ }
+ assert(level_size2 == bytes_next_level);
+#endif
+ level_size = bytes_next_level;
+ bytes_next_level = 0;
+ } else {
+ for (auto* f : files_[level]) {
+ level_size += f->fd.GetFileSize();
+ }
+ }
+ if (level == base_level() && level0_compact_triggered) {
+ // Add base level size to compaction if level0 compaction triggered.
+ estimated_compaction_needed_bytes_ += level_size;
+ }
+ // Add size added by previous compaction
+ level_size += bytes_compact_to_next_level;
+ bytes_compact_to_next_level = 0;
+ uint64_t level_target = MaxBytesForLevel(level);
+ if (level_size > level_target) {
+ bytes_compact_to_next_level = level_size - level_target;
+ // Estimate the actual compaction fan-out ratio as size ratio between
+ // the two levels.
+
+ assert(bytes_next_level == 0);
+ if (level + 1 < num_levels_) {
+ for (auto* f : files_[level + 1]) {
+ bytes_next_level += f->fd.GetFileSize();
+ }
+ }
+ if (bytes_next_level > 0) {
+ assert(level_size > 0);
+ estimated_compaction_needed_bytes_ += static_cast<uint64_t>(
+ static_cast<double>(bytes_compact_to_next_level) *
+ (static_cast<double>(bytes_next_level) /
+ static_cast<double>(level_size) +
+ 1));
+ }
+ }
+ }
+}
+
+namespace {
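+// Returns the number of files in `files` that are not being compacted and
+// whose oldest ancestor time falls outside the configured TTL window.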
+uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const std::vector<FileMetaData*>& files) {
+ uint32_t ttl_expired_files_count = 0;
+
+ int64_t _current_time;
+ auto status = ioptions.env->GetCurrentTime(&_current_time);
+ if (status.ok()) {
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ for (FileMetaData* f : files) {
+ if (!f->being_compacted) {
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0 &&
+ oldest_ancester_time < (current_time - mutable_cf_options.ttl)) {
+ ttl_expired_files_count++;
+ }
+ }
+ }
+ }
+ return ttl_expired_files_count;
+}
+} // anonymous namespace
+
+void VersionStorageInfo::ComputeCompactionScore(
+ const ImmutableCFOptions& immutable_cf_options,
+ const MutableCFOptions& mutable_cf_options) {
+ for (int level = 0; level <= MaxInputLevel(); level++) {
+ double score;
+ if (level == 0) {
+ // We treat level-0 specially by bounding the number of files
+ // instead of number of bytes for two reasons:
+ //
+ // (1) With larger write-buffer sizes, it is nice not to do too
+ // many level-0 compactions.
+ //
+ // (2) The files in level-0 are merged on every read and
+ // therefore we wish to avoid too many files when the individual
+ // file size is small (perhaps because of a small write-buffer
+ // setting, or very high compression ratios, or lots of
+ // overwrites/deletions).
+ int num_sorted_runs = 0;
+ uint64_t total_size = 0;
+ for (auto* f : files_[level]) {
+ if (!f->being_compacted) {
+ total_size += f->compensated_file_size;
+ num_sorted_runs++;
+ }
+ }
+ if (compaction_style_ == kCompactionStyleUniversal) {
+ // For universal compaction, we use level0 score to indicate
+ // compaction score for the whole DB. Adding other levels as if
+ // they are L0 files.
+ for (int i = 1; i < num_levels(); i++) {
+ if (!files_[i].empty() && !files_[i][0]->being_compacted) {
+ num_sorted_runs++;
+ }
+ }
+ }
+
+ if (compaction_style_ == kCompactionStyleFIFO) {
+ score = static_cast<double>(total_size) /
+ mutable_cf_options.compaction_options_fifo.max_table_files_size;
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction) {
+ score = std::max(
+ static_cast<double>(num_sorted_runs) /
+ mutable_cf_options.level0_file_num_compaction_trigger,
+ score);
+ }
+ if (mutable_cf_options.ttl > 0) {
+ score = std::max(
+ static_cast<double>(GetExpiredTtlFilesCount(
+ immutable_cf_options, mutable_cf_options, files_[level])),
+ score);
+ }
+
+ } else {
+ score = static_cast<double>(num_sorted_runs) /
+ mutable_cf_options.level0_file_num_compaction_trigger;
+ if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+ // Level-based involves L0->L0 compactions that can lead to oversized
+ // L0 files. Take into account size as well to avoid later giant
+ // compactions to the base level.
+ score = std::max(
+ score, static_cast<double>(total_size) /
+ mutable_cf_options.max_bytes_for_level_base);
+ }
+ }
+ } else {
+ // Compute the ratio of current size to size limit.
+ uint64_t level_bytes_no_compacting = 0;
+ for (auto f : files_[level]) {
+ if (!f->being_compacted) {
+ level_bytes_no_compacting += f->compensated_file_size;
+ }
+ }
+ score = static_cast<double>(level_bytes_no_compacting) /
+ MaxBytesForLevel(level);
+ }
+ compaction_level_[level] = level;
+ compaction_score_[level] = score;
+ }
+
+ // Sort all the levels based on their score. Higher scores get listed
+ // first. Use bubble sort because the number of entries is small.
+ for (int i = 0; i < num_levels() - 2; i++) {
+ for (int j = i + 1; j < num_levels() - 1; j++) {
+ if (compaction_score_[i] < compaction_score_[j]) {
+ double score = compaction_score_[i];
+ int level = compaction_level_[i];
+ compaction_score_[i] = compaction_score_[j];
+ compaction_level_[i] = compaction_level_[j];
+ compaction_score_[j] = score;
+ compaction_level_[j] = level;
+ }
+ }
+ }
+ ComputeFilesMarkedForCompaction();
+ ComputeBottommostFilesMarkedForCompaction();
+ if (mutable_cf_options.ttl > 0) {
+ ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl);
+ }
+ if (mutable_cf_options.periodic_compaction_seconds > 0) {
+ ComputeFilesMarkedForPeriodicCompaction(
+ immutable_cf_options, mutable_cf_options.periodic_compaction_seconds);
+ }
+ EstimateCompactionBytesNeeded(mutable_cf_options);
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
+ files_marked_for_compaction_.clear();
+ int last_qualify_level = 0;
+
+ // Do not include files from the last level that contains data.
+ // If the table properties collector suggests compacting a file on that
+ // last level, we should not move it to a new level.
+ for (int level = num_levels() - 1; level >= 1; level--) {
+ if (!files_[level].empty()) {
+ last_qualify_level = level - 1;
+ break;
+ }
+ }
+
+ for (int level = 0; level <= last_qualify_level; level++) {
+ for (auto* f : files_[level]) {
+ if (!f->being_compacted && f->marked_for_compaction) {
+ files_marked_for_compaction_.emplace_back(level, f);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeExpiredTtlFiles(
+ const ImmutableCFOptions& ioptions, const uint64_t ttl) {
+ assert(ttl > 0);
+
+ expired_ttl_files_.clear();
+
+ int64_t _current_time;
+ auto status = ioptions.env->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ return;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ for (int level = 0; level < num_levels() - 1; level++) {
+ for (FileMetaData* f : files_[level]) {
+ if (!f->being_compacted) {
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time > 0 &&
+ oldest_ancester_time < (current_time - ttl)) {
+ expired_ttl_files_.emplace_back(level, f);
+ }
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
+ const ImmutableCFOptions& ioptions,
+ const uint64_t periodic_compaction_seconds) {
+ assert(periodic_compaction_seconds > 0);
+
+ files_marked_for_periodic_compaction_.clear();
+
+ int64_t temp_current_time;
+ auto status = ioptions.env->GetCurrentTime(&temp_current_time);
+ if (!status.ok()) {
+ return;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+
+ // If periodic_compaction_seconds is larger than current time, periodic
+ // compaction can't possibly be triggered.
+ if (periodic_compaction_seconds > current_time) {
+ return;
+ }
+
+ const uint64_t allowed_time_limit =
+ current_time - periodic_compaction_seconds;
+
+ for (int level = 0; level < num_levels(); level++) {
+ for (auto f : files_[level]) {
+ if (!f->being_compacted) {
+ // Compute a file's modification time in the following order:
+ // 1. Use file_creation_time table property if it is > 0.
+ // 2. Use creation_time table property if it is > 0.
+ // 3. Use file's mtime metadata if the above two table properties are 0.
+ // Don't consider the file at all if the modification time cannot be
+ // correctly determined based on the above conditions.
+ uint64_t file_modification_time = f->TryGetFileCreationTime();
+ if (file_modification_time == kUnknownFileCreationTime) {
+ file_modification_time = f->TryGetOldestAncesterTime();
+ }
+ if (file_modification_time == kUnknownOldestAncesterTime) {
+ auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
+ f->fd.GetPathId());
+ status = ioptions.env->GetFileModificationTime(
+ file_path, &file_modification_time);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(ioptions.info_log,
+ "Can't get file modification time: %s: %s",
+ file_path.c_str(), status.ToString().c_str());
+ continue;
+ }
+ }
+ if (file_modification_time > 0 &&
+ file_modification_time < allowed_time_limit) {
+ files_marked_for_periodic_compaction_.emplace_back(level, f);
+ }
+ }
+ }
+ }
+}
+
+namespace {
+
+// used to sort files by size
+struct Fsize {
+ size_t index;
+ FileMetaData* file;
+};
+
+// Comparator that is used to sort files based on their size.
+// In normal mode: descending size.
+bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
+ return (first.file->compensated_file_size >
+ second.file->compensated_file_size);
+}
+} // anonymous namespace
+
+void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) {
+ auto* level_files = &files_[level];
+ // Must not overlap
+#ifndef NDEBUG
+ if (level > 0 && !level_files->empty() &&
+ internal_comparator_->Compare(
+ (*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) {
+ auto* f2 = (*level_files)[level_files->size() - 1];
+ if (info_log != nullptr) {
+ Error(info_log, "Adding new file %" PRIu64
+ " range (%s, %s) to level %d but overlapping "
+ "with existing file %" PRIu64 " %s %s",
+ f->fd.GetNumber(), f->smallest.DebugString(true).c_str(),
+ f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(),
+ f2->smallest.DebugString(true).c_str(),
+ f2->largest.DebugString(true).c_str());
+ LogFlush(info_log);
+ }
+ assert(false);
+ }
+#else
+ (void)info_log;
+#endif
+ f->refs++;
+ level_files->push_back(f);
+}
+
+// Version::PrepareApply() needs to be called before calling this function, or
+// the following functions need to have been called:
+// 1. UpdateNumNonEmptyLevels();
+// 2. CalculateBaseBytes();
+// 3. UpdateFilesByCompactionPri();
+// 4. GenerateFileIndexer();
+// 5. GenerateLevelFilesBrief();
+// 6. GenerateLevel0NonOverlapping();
+// 7. GenerateBottommostFiles();
+void VersionStorageInfo::SetFinalized() {
+ finalized_ = true;
+#ifndef NDEBUG
+ if (compaction_style_ != kCompactionStyleLevel) {
+ // Not level based compaction.
+ return;
+ }
+ assert(base_level_ < 0 || num_levels() == 1 ||
+ (base_level_ >= 1 && base_level_ < num_levels()));
+ // Verify all levels newer than base_level are empty except L0
+ for (int level = 1; level < base_level(); level++) {
+ assert(NumLevelBytes(level) == 0);
+ }
+ uint64_t max_bytes_prev_level = 0;
+ for (int level = base_level(); level < num_levels() - 1; level++) {
+ if (LevelFiles(level).size() == 0) {
+ continue;
+ }
+ assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
+ max_bytes_prev_level = MaxBytesForLevel(level);
+ }
+ int num_empty_non_l0_level = 0;
+ for (int level = 0; level < num_levels(); level++) {
+ assert(LevelFiles(level).size() == 0 ||
+ LevelFiles(level).size() == LevelFilesBrief(level).num_files);
+ if (level > 0 && NumLevelBytes(level) > 0) {
+ num_empty_non_l0_level++;
+ }
+ if (LevelFiles(level).size() > 0) {
+ assert(level < num_non_empty_levels());
+ }
+ }
+ assert(compaction_level_.size() > 0);
+ assert(compaction_level_.size() == compaction_score_.size());
+#endif
+}
+
+void VersionStorageInfo::UpdateNumNonEmptyLevels() {
+ num_non_empty_levels_ = num_levels_;
+ for (int i = num_levels_ - 1; i >= 0; i--) {
+ if (files_[i].size() != 0) {
+ return;
+ } else {
+ num_non_empty_levels_ = i;
+ }
+ }
+}
+
+namespace {
+// Sort `temp` based on the ratio of overlapping size to file size
+void SortFileByOverlappingRatio(
+ const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files,
+ const std::vector<FileMetaData*>& next_level_files,
+ std::vector<Fsize>* temp) {
+ std::unordered_map<uint64_t, uint64_t> file_to_order;
+ auto next_level_it = next_level_files.begin();
+
+ for (auto& file : files) {
+ uint64_t overlapping_bytes = 0;
+    // Skip files in the next level that are entirely before the current file
+ while (next_level_it != next_level_files.end() &&
+ icmp.Compare((*next_level_it)->largest, file->smallest) < 0) {
+ next_level_it++;
+ }
+
+ while (next_level_it != next_level_files.end() &&
+ icmp.Compare((*next_level_it)->smallest, file->largest) < 0) {
+ overlapping_bytes += (*next_level_it)->fd.file_size;
+
+ if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) {
+        // The next-level file crosses the upper boundary of the current file.
+ break;
+ }
+ next_level_it++;
+ }
+
+ assert(file->compensated_file_size != 0);
+ file_to_order[file->fd.GetNumber()] =
+ overlapping_bytes * 1024u / file->compensated_file_size;
+ }
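+  // Files with a smaller overlap-to-size ratio are ordered first: compacting
+  // them rewrites the least next-level data relative to the data moved, which
+  // is what kMinOverlappingRatio optimizes for to keep write amplification
+  // low.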
+
+ std::sort(temp->begin(), temp->end(),
+ [&](const Fsize& f1, const Fsize& f2) -> bool {
+ return file_to_order[f1.file->fd.GetNumber()] <
+ file_to_order[f2.file->fd.GetNumber()];
+ });
+}
+} // namespace
+
+void VersionStorageInfo::UpdateFilesByCompactionPri(
+ CompactionPri compaction_pri) {
+ if (compaction_style_ == kCompactionStyleNone ||
+ compaction_style_ == kCompactionStyleFIFO ||
+ compaction_style_ == kCompactionStyleUniversal) {
+ // don't need this
+ return;
+ }
+ // No need to sort the highest level because it is never compacted.
+ for (int level = 0; level < num_levels() - 1; level++) {
+ const std::vector<FileMetaData*>& files = files_[level];
+ auto& files_by_compaction_pri = files_by_compaction_pri_[level];
+ assert(files_by_compaction_pri.size() == 0);
+
+ // populate a temp vector for sorting based on size
+ std::vector<Fsize> temp(files.size());
+ for (size_t i = 0; i < files.size(); i++) {
+ temp[i].index = i;
+ temp[i].file = files[i];
+ }
+
+    // Sort the top kNumberFilesToSort files based on file size
+ size_t num = VersionStorageInfo::kNumberFilesToSort;
+ if (num > temp.size()) {
+ num = temp.size();
+ }
+ switch (compaction_pri) {
+ case kByCompensatedSize:
+ std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+ CompareCompensatedSizeDescending);
+ break;
+ case kOldestLargestSeqFirst:
+ std::sort(temp.begin(), temp.end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.largest_seqno <
+ f2.file->fd.largest_seqno;
+ });
+ break;
+ case kOldestSmallestSeqFirst:
+ std::sort(temp.begin(), temp.end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.smallest_seqno <
+ f2.file->fd.smallest_seqno;
+ });
+ break;
+ case kMinOverlappingRatio:
+ SortFileByOverlappingRatio(*internal_comparator_, files_[level],
+ files_[level + 1], &temp);
+ break;
+ default:
+ assert(false);
+ }
+ assert(temp.size() == files.size());
+
+ // initialize files_by_compaction_pri_
+ for (size_t i = 0; i < temp.size(); i++) {
+ files_by_compaction_pri.push_back(static_cast<int>(temp[i].index));
+ }
+ next_file_to_compact_by_size_[level] = 0;
+ assert(files_[level].size() == files_by_compaction_pri_[level].size());
+ }
+}
+
+void VersionStorageInfo::GenerateLevel0NonOverlapping() {
+ assert(!finalized_);
+ level0_non_overlapping_ = true;
+ if (level_files_brief_.size() == 0) {
+ return;
+ }
+
+ // A copy of L0 files sorted by smallest key
+ std::vector<FdWithKeyRange> level0_sorted_file(
+ level_files_brief_[0].files,
+ level_files_brief_[0].files + level_files_brief_[0].num_files);
+ std::sort(level0_sorted_file.begin(), level0_sorted_file.end(),
+ [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool {
+ return (internal_comparator_->Compare(f1.smallest_key,
+ f2.smallest_key) < 0);
+ });
+
+ for (size_t i = 1; i < level0_sorted_file.size(); ++i) {
+ FdWithKeyRange& f = level0_sorted_file[i];
+ FdWithKeyRange& prev = level0_sorted_file[i - 1];
+ if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) {
+ level0_non_overlapping_ = false;
+ break;
+ }
+ }
+}
+
+void VersionStorageInfo::GenerateBottommostFiles() {
+ assert(!finalized_);
+ assert(bottommost_files_.empty());
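+  // A file is considered bottommost when no older sorted run (a deeper level,
+  // or an older L0 file) may contain keys in its range; see
+  // RangeMightExistAfterSortedRun() below.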
+ for (size_t level = 0; level < level_files_brief_.size(); ++level) {
+ for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files;
+ ++file_idx) {
+ const FdWithKeyRange& f = level_files_brief_[level].files[file_idx];
+ int l0_file_idx;
+ if (level == 0) {
+ l0_file_idx = static_cast<int>(file_idx);
+ } else {
+ l0_file_idx = -1;
+ }
+ Slice smallest_user_key = ExtractUserKey(f.smallest_key);
+ Slice largest_user_key = ExtractUserKey(f.largest_key);
+ if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key,
+ static_cast<int>(level),
+ l0_file_idx)) {
+ bottommost_files_.emplace_back(static_cast<int>(level),
+ f.file_metadata);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) {
+ assert(seqnum >= oldest_snapshot_seqnum_);
+ oldest_snapshot_seqnum_ = seqnum;
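+  // bottommost_files_mark_threshold_ is the smallest largest_seqno among
+  // bottommost files that could not yet be marked because of a snapshot.
+  // Once the oldest snapshot advances past it, recomputing may mark more
+  // files for compaction.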
+ if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
+ ComputeBottommostFilesMarkedForCompaction();
+ }
+}
+
+void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() {
+ bottommost_files_marked_for_compaction_.clear();
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+ for (auto& level_and_file : bottommost_files_) {
+ if (!level_and_file.second->being_compacted &&
+ level_and_file.second->fd.largest_seqno != 0 &&
+ level_and_file.second->num_deletions > 1) {
+ // largest_seqno might be nonzero due to containing the final key in an
+      // earlier compaction, whose seqnum we didn't zero out. Multiple
+      // deletions ensure the file really contains deleted or overwritten keys.
+ if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+ bottommost_files_marked_for_compaction_.push_back(level_and_file);
+ } else {
+ bottommost_files_mark_threshold_ =
+ std::min(bottommost_files_mark_threshold_,
+ level_and_file.second->fd.largest_seqno);
+ }
+ }
+ }
+}
+
+void Version::Ref() {
+ ++refs_;
+}
+
+bool Version::Unref() {
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ delete this;
+ return true;
+ }
+ return false;
+}
+
+bool VersionStorageInfo::OverlapInLevel(int level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ if (level >= num_non_empty_levels_) {
+ // empty level, no overlap
+ return false;
+ }
+ return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
+ level_files_brief_[level], smallest_user_key,
+ largest_user_key);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// If hint_index is specified, then it points to a file in the
+// overlapping range.
+// If file_index is non-null, *file_index is set to the index of one file in
+// the overlapping range.
+void VersionStorageInfo::GetOverlappingInputs(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+ bool expand_range, InternalKey** next_smallest) const {
+ if (level >= num_non_empty_levels_) {
+ // this level is empty, no overlapping inputs
+ return;
+ }
+
+ inputs->clear();
+ if (file_index) {
+ *file_index = -1;
+ }
+ const Comparator* user_cmp = user_comparator_;
+ if (level > 0) {
+ GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+ file_index, false, next_smallest);
+ return;
+ }
+
+ if (next_smallest) {
+ // next_smallest key only makes sense for non-level 0, where files are
+ // non-overlapping
+ *next_smallest = nullptr;
+ }
+
+ Slice user_begin, user_end;
+ if (begin != nullptr) {
+ user_begin = begin->user_key();
+ }
+ if (end != nullptr) {
+ user_end = end->user_key();
+ }
+
+  // `index` stores the indices of the files that still need to be checked.
+ std::list<size_t> index;
+ for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+ index.emplace_back(i);
+ }
+
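+  // The scan repeats until a full pass finds no additional overlapping file;
+  // with expand_range, each file found may widen [user_begin, user_end] and
+  // pull in files that were skipped earlier.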
+ while (!index.empty()) {
+ bool found_overlapping_file = false;
+ auto iter = index.begin();
+ while (iter != index.end()) {
+ FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
+ const Slice file_start = ExtractUserKey(f->smallest_key);
+ const Slice file_limit = ExtractUserKey(f->largest_key);
+ if (begin != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
+ // "f" is completely before specified range; skip it
+ iter++;
+ } else if (end != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
+ // "f" is completely after specified range; skip it
+ iter++;
+ } else {
+        // The file overlaps the range.
+ inputs->emplace_back(files_[level][*iter]);
+ found_overlapping_file = true;
+ // record the first file index.
+ if (file_index && *file_index == -1) {
+ *file_index = static_cast<int>(*iter);
+ }
+        // This file overlaps; erase it to avoid checking it again.
+ iter = index.erase(iter);
+ if (expand_range) {
+ if (begin != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
+ user_begin = file_start;
+ }
+ if (end != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
+ user_end = file_limit;
+ }
+ }
+ }
+ }
+    // If none of the remaining files overlap, stop.
+ if (!found_overlapping_file) {
+ break;
+ }
+ }
+}
+
+// Store in "*inputs" the files in "level" that are within range [begin,end].
+// Guarantee a "clean cut" boundary between the files in inputs and the
+// surrounding files, while including the maximum number of files.
+// This will ensure that no part of a key is lost during compaction.
+// If hint_index is specified, then it points to a file in the range.
+// If file_index is non-null, *file_index is set to the index of one file in
+// the range.
+void VersionStorageInfo::GetCleanInputsWithinInterval(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
+ inputs->clear();
+ if (file_index) {
+ *file_index = -1;
+ }
+ if (level >= num_non_empty_levels_ || level == 0 ||
+ level_files_brief_[level].num_files == 0) {
+ // this level is empty, no inputs within range
+ // also don't support clean input interval within L0
+ return;
+ }
+
+ GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
+ hint_index, file_index,
+ true /* within_interval */);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end].
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// If within_interval is set, then only store the maximum set of clean inputs
+// within range [begin, end]. "Clean" means there is a boundary
+// between the files in "*inputs" and the surrounding files.
+void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+ bool within_interval, InternalKey** next_smallest) const {
+ assert(level > 0);
+
+ auto user_cmp = user_comparator_;
+ const FdWithKeyRange* files = level_files_brief_[level].files;
+ const int num_files = static_cast<int>(level_files_brief_[level].num_files);
+
+ // begin to use binary search to find lower bound
+ // and upper bound.
+ int start_index = 0;
+ int end_index = num_files;
+
+ if (begin != nullptr) {
+    // When within_interval is true, compare `begin` against the file's
+    // smallest key, so that std::lower_bound starts at the first file that
+    // begins at or after `begin` (keeping the cut clean at the lower end).
+ auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
+ const InternalKey* k) {
+ auto& file_key = within_interval ? f.file_metadata->smallest
+ : f.file_metadata->largest;
+ return sstableKeyCompare(user_cmp, file_key, *k) < 0;
+ };
+
+ start_index = static_cast<int>(
+ std::lower_bound(files,
+ files + (hint_index == -1 ? num_files : hint_index),
+ begin, cmp) -
+ files);
+
+ if (start_index > 0 && within_interval) {
+ bool is_overlapping = true;
+ while (is_overlapping && start_index < num_files) {
+ auto& pre_limit = files[start_index - 1].file_metadata->largest;
+ auto& cur_start = files[start_index].file_metadata->smallest;
+ is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0;
+ start_index += is_overlapping;
+ }
+ }
+ }
+
+ if (end != nullptr) {
+    // When within_interval is true, compare `end` against the file's
+    // largest key, so that std::upper_bound stops before the first file that
+    // ends after `end` (keeping the cut clean at the upper end).
+ auto cmp = [&user_cmp, &within_interval](const InternalKey* k,
+ const FdWithKeyRange& f) {
+ auto& file_key = within_interval ? f.file_metadata->largest
+ : f.file_metadata->smallest;
+ return sstableKeyCompare(user_cmp, *k, file_key) < 0;
+ };
+
+ end_index = static_cast<int>(
+ std::upper_bound(files + start_index, files + num_files, end, cmp) -
+ files);
+
+ if (end_index < num_files && within_interval) {
+ bool is_overlapping = true;
+ while (is_overlapping && end_index > start_index) {
+ auto& next_start = files[end_index].file_metadata->smallest;
+ auto& cur_limit = files[end_index - 1].file_metadata->largest;
+ is_overlapping =
+ sstableKeyCompare(user_cmp, cur_limit, next_start) == 0;
+ end_index -= is_overlapping;
+ }
+ }
+ }
+
+ assert(start_index <= end_index);
+
+ // If there were no overlapping files, return immediately.
+ if (start_index == end_index) {
+ if (next_smallest) {
+ *next_smallest = nullptr;
+ }
+ return;
+ }
+
+ assert(start_index < end_index);
+
+ // returns the index where an overlap is found
+ if (file_index) {
+ *file_index = start_index;
+ }
+
+ // insert overlapping files into vector
+ for (int i = start_index; i < end_index; i++) {
+ inputs->push_back(files_[level][i]);
+ }
+
+ if (next_smallest != nullptr) {
+ // Provide the next key outside the range covered by inputs
+ if (end_index < static_cast<int>(files_[level].size())) {
+ **next_smallest = files_[level][end_index]->smallest;
+ } else {
+ *next_smallest = nullptr;
+ }
+ }
+}
+
+uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
+ assert(level >= 0);
+ assert(level < num_levels());
+ return TotalFileSize(files_[level]);
+}
+
+const char* VersionStorageInfo::LevelSummary(
+ LevelSummaryStorage* scratch) const {
+ int len = 0;
+ if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+ assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
+ if (level_multiplier_ != 0.0) {
+ len = snprintf(
+ scratch->buffer, sizeof(scratch->buffer),
+ "base level %d level multiplier %.2f max bytes base %" PRIu64 " ",
+ base_level_, level_multiplier_, level_max_bytes_[base_level_]);
+ }
+ }
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files[");
+ for (int i = 0; i < num_levels(); i++) {
+ int sz = sizeof(scratch->buffer) - len;
+ int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
+ if (ret < 0 || ret >= sz) break;
+ len += ret;
+ }
+ if (len > 0) {
+ // overwrite the last space
+ --len;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "] max score %.2f", compaction_score_[0]);
+
+ if (!files_marked_for_compaction_.empty()) {
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " (%" ROCKSDB_PRIszt " files need compaction)",
+ files_marked_for_compaction_.size());
+ }
+
+ return scratch->buffer;
+}
+
+const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
+ int level) const {
+ int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
+ for (const auto& f : files_[level]) {
+ int sz = sizeof(scratch->buffer) - len;
+ char sztxt[16];
+ AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
+ int ret = snprintf(scratch->buffer + len, sz,
+ "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
+ f->fd.GetNumber(), f->fd.smallest_seqno, sztxt,
+ static_cast<int>(f->being_compacted));
+ if (ret < 0 || ret >= sz)
+ break;
+ len += ret;
+ }
+ // overwrite the last space (only if files_[level].size() is non-zero)
+ if (files_[level].size() && len > 0) {
+ --len;
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+ return scratch->buffer;
+}
+
+int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
+ uint64_t result = 0;
+ std::vector<FileMetaData*> overlaps;
+ for (int level = 1; level < num_levels() - 1; level++) {
+ for (const auto& f : files_[level]) {
+ GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
+ const uint64_t sum = TotalFileSize(overlaps);
+ if (sum > result) {
+ result = sum;
+ }
+ }
+ }
+ return result;
+}
+
+uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
+ // Note: the result for level zero is not really used since we set
+ // the level-0 compaction threshold based on number of files.
+ assert(level >= 0);
+ assert(level < static_cast<int>(level_max_bytes_.size()));
+ return level_max_bytes_[level];
+}
+
+void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& options) {
+  // Special logic to set the number of sorted runs.
+  // This matches the previous behavior when all files were in L0.
+ int num_l0_count = static_cast<int>(files_[0].size());
+ if (compaction_style_ == kCompactionStyleUniversal) {
+ // For universal compaction, we use level0 score to indicate
+ // compaction score for the whole DB. Adding other levels as if
+ // they are L0 files.
+ for (int i = 1; i < num_levels(); i++) {
+ if (!files_[i].empty()) {
+ num_l0_count++;
+ }
+ }
+ }
+ set_l0_delay_trigger_count(num_l0_count);
+
+ level_max_bytes_.resize(ioptions.num_levels);
+ if (!ioptions.level_compaction_dynamic_level_bytes) {
+ base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
+
+ // Calculate for static bytes base case
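+    // Illustrative example (numbers only for illustration): with
+    // max_bytes_for_level_base = 256MB, max_bytes_for_level_multiplier = 10
+    // and no per-level additional multipliers, level compaction yields
+    // L1 = 256MB, L2 = 2.5GB, L3 = 25GB, and so on.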
+ for (int i = 0; i < ioptions.num_levels; ++i) {
+ if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
+ level_max_bytes_[i] = options.max_bytes_for_level_base;
+ } else if (i > 1) {
+ level_max_bytes_[i] = MultiplyCheckOverflow(
+ MultiplyCheckOverflow(level_max_bytes_[i - 1],
+ options.max_bytes_for_level_multiplier),
+ options.MaxBytesMultiplerAdditional(i - 1));
+ } else {
+ level_max_bytes_[i] = options.max_bytes_for_level_base;
+ }
+ }
+ } else {
+ uint64_t max_level_size = 0;
+
+ int first_non_empty_level = -1;
+    // Find the size of the non-L0 level with the most data.
+    // We cannot use the size of the last level because it can be empty or
+    // smaller than previous levels after compaction.
+ for (int i = 1; i < num_levels_; i++) {
+ uint64_t total_size = 0;
+ for (const auto& f : files_[i]) {
+ total_size += f->fd.GetFileSize();
+ }
+ if (total_size > 0 && first_non_empty_level == -1) {
+ first_non_empty_level = i;
+ }
+ if (total_size > max_level_size) {
+ max_level_size = total_size;
+ }
+ }
+
+ // Prefill every level's max bytes to disallow compaction from there.
+ for (int i = 0; i < num_levels_; i++) {
+ level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
+ }
+
+ if (max_level_size == 0) {
+ // No data for L1 and up. L0 compacts to last level directly.
+ // No compaction from L1+ needs to be scheduled.
+ base_level_ = num_levels_ - 1;
+ } else {
+ uint64_t l0_size = 0;
+ for (const auto& f : files_[0]) {
+ l0_size += f->fd.GetFileSize();
+ }
+
+ uint64_t base_bytes_max =
+ std::max(options.max_bytes_for_level_base, l0_size);
+ uint64_t base_bytes_min = static_cast<uint64_t>(
+ base_bytes_max / options.max_bytes_for_level_multiplier);
+
+      // See whether we can make the last level's target size equal to
+      // max_level_size.
+ uint64_t cur_level_size = max_level_size;
+ for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
+ // Round up after dividing
+ cur_level_size = static_cast<uint64_t>(
+ cur_level_size / options.max_bytes_for_level_multiplier);
+ }
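+      // Illustrative example: starting from max_level_size = 100GB with a
+      // multiplier of 10, two iterations of this loop leave cur_level_size
+      // at 1GB, which is then used below to pick the base level.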
+
+ // Calculate base level and its size.
+ uint64_t base_level_size;
+ if (cur_level_size <= base_bytes_min) {
+        // Case 1. If we make the target size of the last level
+        // max_level_size, the target size of the first non-empty level
+        // would be smaller than base_bytes_min. We set it to base_bytes_min.
+ base_level_size = base_bytes_min + 1U;
+ base_level_ = first_non_empty_level;
+ ROCKS_LOG_INFO(ioptions.info_log,
+ "More existing levels in DB than needed. "
+ "max_bytes_for_level_multiplier may not be guaranteed.");
+ } else {
+ // Find base level (where L0 data is compacted to).
+ base_level_ = first_non_empty_level;
+ while (base_level_ > 1 && cur_level_size > base_bytes_max) {
+ --base_level_;
+ cur_level_size = static_cast<uint64_t>(
+ cur_level_size / options.max_bytes_for_level_multiplier);
+ }
+ if (cur_level_size > base_bytes_max) {
+ // Even L1 will be too large
+ assert(base_level_ == 1);
+ base_level_size = base_bytes_max;
+ } else {
+ base_level_size = cur_level_size;
+ }
+ }
+
+ level_multiplier_ = options.max_bytes_for_level_multiplier;
+ assert(base_level_size > 0);
+ if (l0_size > base_level_size &&
+ (l0_size > options.max_bytes_for_level_base ||
+ static_cast<int>(files_[0].size() / 2) >=
+ options.level0_file_num_compaction_trigger)) {
+ // We adjust the base level according to actual L0 size, and adjust
+ // the level multiplier accordingly, when:
+ // 1. the L0 size is larger than level size base, or
+ // 2. number of L0 files reaches twice the L0->L1 compaction trigger
+        // Otherwise we don't adjust, to keep the LSM-tree structure stable,
+        // unless L0 compaction is backlogged.
+ base_level_size = l0_size;
+ if (base_level_ == num_levels_ - 1) {
+ level_multiplier_ = 1.0;
+ } else {
+ level_multiplier_ = std::pow(
+ static_cast<double>(max_level_size) /
+ static_cast<double>(base_level_size),
+ 1.0 / static_cast<double>(num_levels_ - base_level_ - 1));
+ }
+ }
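+      // Illustrative example: if L0 holds 1GB, the largest level holds 100GB,
+      // and the last level is two levels below the base level, the multiplier
+      // above works out to pow(100.0, 1.0 / 2) = 10.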
+
+ uint64_t level_size = base_level_size;
+ for (int i = base_level_; i < num_levels_; i++) {
+ if (i > base_level_) {
+ level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
+ }
+ // Don't set any level below base_bytes_max. Otherwise, the LSM can
+ // assume an hourglass shape where L1+ sizes are smaller than L0. This
+ // causes compaction scoring, which depends on level sizes, to favor L1+
+ // at the expense of L0, which may fill up and stall.
+ level_max_bytes_[i] = std::max(level_size, base_bytes_max);
+ }
+ }
+ }
+}
+
+uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
+  // Estimate the live data size by adding up, for each key range, the size
+  // of the file in the deepest level that covers it. Note: the estimate
+  // depends on the ordering of files in level 0 because files in level 0 can
+  // be overlapping.
+ uint64_t size = 0;
+
+ auto ikey_lt = [this](InternalKey* x, InternalKey* y) {
+ return internal_comparator_->Compare(*x, *y) < 0;
+ };
+ // (Ordered) map of largest keys in non-overlapping files
+ std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt);
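+  // Levels are scanned from the bottom up, so a key range is attributed to
+  // its deepest file first; overlapping copies in shallower levels are then
+  // skipped and not double counted.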
+
+ for (int l = num_levels_ - 1; l >= 0; l--) {
+ bool found_end = false;
+ for (auto file : files_[l]) {
+ // Find the first file where the largest key is larger than the smallest
+ // key of the current file. If this file does not overlap with the
+ // current file, none of the files in the map does. If there is
+ // no potential overlap, we can safely insert the rest of this level
+ // (if the level is not 0) into the map without checking again because
+ // the elements in the level are sorted and non-overlapping.
+ auto lb = (found_end && l != 0) ?
+ ranges.end() : ranges.lower_bound(&file->smallest);
+ found_end = (lb == ranges.end());
+ if (found_end || internal_comparator_->Compare(
+ file->largest, (*lb).second->smallest) < 0) {
+ ranges.emplace_hint(lb, &file->largest, file);
+ size += file->fd.file_size;
+ }
+ }
+ }
+ return size;
+}
+
+bool VersionStorageInfo::RangeMightExistAfterSortedRun(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int last_level, int last_l0_idx) {
+ assert((last_l0_idx != -1) == (last_level == 0));
+ // TODO(ajkr): this preserves earlier behavior where we considered an L0 file
+ // bottommost only if it's the oldest L0 file and there are no files on older
+ // levels. It'd be better to consider it bottommost if there's no overlap in
+ // older levels/files.
+ if (last_level == 0 &&
+ last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) {
+ return true;
+ }
+
+ // Checks whether there are files living beyond the `last_level`. If lower
+ // levels have files, it checks for overlap between [`smallest_key`,
+  // `largest_key`] and those files. Bottom-level optimizations can be made if
+ // there are no files in lower levels or if there is no overlap with the files
+ // in the lower levels.
+ for (int level = last_level + 1; level < num_levels(); level++) {
+ // The range is not in the bottommost level if there are files in lower
+ // levels when the `last_level` is 0 or if there are files in lower levels
+ // which overlap with [`smallest_key`, `largest_key`].
+ if (files_[level].size() > 0 &&
+ (last_level == 0 ||
+ OverlapInLevel(level, &smallest_user_key, &largest_user_key))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void Version::AddLiveFiles(std::vector<FileDescriptor>* live) {
+ for (int level = 0; level < storage_info_.num_levels(); level++) {
+ const std::vector<FileMetaData*>& files = storage_info_.files_[level];
+ for (const auto& file : files) {
+ live->push_back(file->fd);
+ }
+ }
+}
+
+std::string Version::DebugString(bool hex, bool print_stats) const {
+ std::string r;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ // E.g.,
+ // --- level 1 ---
+ // 17:123[1 .. 124]['a' .. 'd']
+ // 20:43[124 .. 128]['e' .. 'g']
+ //
+ // if print_stats=true:
+ // 17:123[1 .. 124]['a' .. 'd'](4096)
+ r.append("--- level ");
+ AppendNumberTo(&r, level);
+ r.append(" --- version# ");
+ AppendNumberTo(&r, version_number_);
+ r.append(" ---\n");
+ const std::vector<FileMetaData*>& files = storage_info_.files_[level];
+ for (size_t i = 0; i < files.size(); i++) {
+ r.push_back(' ');
+ AppendNumberTo(&r, files[i]->fd.GetNumber());
+ r.push_back(':');
+ AppendNumberTo(&r, files[i]->fd.GetFileSize());
+ r.append("[");
+ AppendNumberTo(&r, files[i]->fd.smallest_seqno);
+ r.append(" .. ");
+ AppendNumberTo(&r, files[i]->fd.largest_seqno);
+ r.append("]");
+ r.append("[");
+ r.append(files[i]->smallest.DebugString(hex));
+ r.append(" .. ");
+ r.append(files[i]->largest.DebugString(hex));
+ r.append("]");
+ if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) {
+ r.append(" blob_file:");
+ AppendNumberTo(&r, files[i]->oldest_blob_file_number);
+ }
+ if (print_stats) {
+ r.append("(");
+ r.append(ToString(
+ files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed)));
+ r.append(")");
+ }
+ r.append("\n");
+ }
+ }
+ return r;
+}
+
+// this is used to batch writes to the manifest file
+struct VersionSet::ManifestWriter {
+ Status status;
+ bool done;
+ InstrumentedCondVar cv;
+ ColumnFamilyData* cfd;
+ const MutableCFOptions mutable_cf_options;
+ const autovector<VersionEdit*>& edit_list;
+
+ explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd,
+ const MutableCFOptions& cf_options,
+ const autovector<VersionEdit*>& e)
+ : done(false),
+ cv(mu),
+ cfd(_cfd),
+ mutable_cf_options(cf_options),
+ edit_list(e) {}
+};
+
+Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) {
+ assert(edit);
+ if (edit->is_in_atomic_group_) {
+ TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup");
+ if (replay_buffer_.empty()) {
+ replay_buffer_.resize(edit->remaining_entries_ + 1);
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit);
+ }
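+    // The first edit of an atomic group carries the number of edits still to
+    // come, so the buffer is sized to remaining_entries_ + 1 (this edit plus
+    // the remaining ones); every edit must agree with that size.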
+ read_edits_in_atomic_group_++;
+ if (read_edits_in_atomic_group_ + edit->remaining_entries_ !=
+ static_cast<uint32_t>(replay_buffer_.size())) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit);
+ return Status::Corruption("corrupted atomic group");
+ }
+ replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit;
+ if (read_edits_in_atomic_group_ == replay_buffer_.size()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit);
+ return Status::OK();
+ }
+ return Status::OK();
+ }
+
+ // A normal edit.
+ if (!replay_buffer().empty()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit);
+ return Status::Corruption("corrupted atomic group");
+ }
+ return Status::OK();
+}
+
+bool AtomicGroupReadBuffer::IsFull() const {
+ return read_edits_in_atomic_group_ == replay_buffer_.size();
+}
+
+bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); }
+
+void AtomicGroupReadBuffer::Clear() {
+ read_edits_in_atomic_group_ = 0;
+ replay_buffer_.clear();
+}
+
+VersionSet::VersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& storage_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer)
+ : column_family_set_(new ColumnFamilySet(
+ dbname, _db_options, storage_options, table_cache,
+ write_buffer_manager, write_controller, block_cache_tracer)),
+ env_(_db_options->env),
+ fs_(_db_options->fs.get()),
+ dbname_(dbname),
+ db_options_(_db_options),
+ next_file_number_(2),
+ manifest_file_number_(0), // Filled by Recover()
+ options_file_number_(0),
+ pending_manifest_file_number_(0),
+ last_sequence_(0),
+ last_allocated_sequence_(0),
+ last_published_sequence_(0),
+ prev_log_number_(0),
+ current_version_number_(0),
+ manifest_file_size_(0),
+ file_options_(storage_options),
+ block_cache_tracer_(block_cache_tracer) {}
+
+VersionSet::~VersionSet() {
+ // we need to delete column_family_set_ because its destructor depends on
+ // VersionSet
+ Cache* table_cache = column_family_set_->get_table_cache();
+ column_family_set_.reset();
+ for (auto& file : obsolete_files_) {
+ if (file.metadata->table_reader_handle) {
+ table_cache->Release(file.metadata->table_reader_handle);
+ TableCache::Evict(table_cache, file.metadata->fd.GetNumber());
+ }
+ file.DeleteMetadata();
+ }
+ obsolete_files_.clear();
+}
+
+void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
+ Version* v) {
+ // compute new compaction score
+ v->storage_info()->ComputeCompactionScore(
+ *column_family_data->ioptions(),
+ *column_family_data->GetLatestMutableCFOptions());
+
+ // Mark v finalized
+ v->storage_info_.SetFinalized();
+
+ // Make "v" current
+ assert(v->refs_ == 0);
+ Version* current = column_family_data->current();
+ assert(v != current);
+ if (current != nullptr) {
+ assert(current->refs_ > 0);
+ current->Unref();
+ }
+ column_family_data->SetCurrent(v);
+ v->Ref();
+
+ // Append to linked list
+ v->prev_ = column_family_data->dummy_versions()->prev_;
+ v->next_ = column_family_data->dummy_versions();
+ v->prev_->next_ = v;
+ v->next_->prev_ = v;
+}
+
+Status VersionSet::ProcessManifestWrites(
+ std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
+ Directory* db_directory, bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options) {
+ assert(!writers.empty());
+ ManifestWriter& first_writer = writers.front();
+ ManifestWriter* last_writer = &first_writer;
+
+ assert(!manifest_writers_.empty());
+ assert(manifest_writers_.front() == &first_writer);
+
+ autovector<VersionEdit*> batch_edits;
+ autovector<Version*> versions;
+ autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
+ std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
+
+ if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ // No group commits for column family add or drop
+ LogAndApplyCFHelper(first_writer.edit_list.front());
+ batch_edits.push_back(first_writer.edit_list.front());
+ } else {
+ auto it = manifest_writers_.cbegin();
+ size_t group_start = std::numeric_limits<size_t>::max();
+ while (it != manifest_writers_.cend()) {
+ if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
+ // no group commits for column family add or drop
+ break;
+ }
+ last_writer = *(it++);
+ assert(last_writer != nullptr);
+ assert(last_writer->cfd != nullptr);
+ if (last_writer->cfd->IsDropped()) {
+ // If we detect a dropped CF at this point, and the corresponding
+ // version edits belong to an atomic group, then we need to find out
+ // the preceding version edits in the same atomic group, and update
+ // their `remaining_entries_` member variable because we are NOT going
+        // to write the version edits of the dropped CF to the MANIFEST. If we
+        // don't update them, then Recover can report a corrupted atomic group
+        // because the `remaining_entries_` values do not match.
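+        // Illustrative example: in an atomic group of 5 edits where the
+        // dropped CF contributes the 2 edits skipped here, each edit already
+        // placed in batch_edits has its remaining_entries_ reduced by 2.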
+ if (!batch_edits.empty()) {
+ if (batch_edits.back()->is_in_atomic_group_ &&
+ batch_edits.back()->remaining_entries_ > 0) {
+ assert(group_start < batch_edits.size());
+ const auto& edit_list = last_writer->edit_list;
+ size_t k = 0;
+ while (k < edit_list.size()) {
+ if (!edit_list[k]->is_in_atomic_group_) {
+ break;
+ } else if (edit_list[k]->remaining_entries_ == 0) {
+ ++k;
+ break;
+ }
+ ++k;
+ }
+ for (auto i = group_start; i < batch_edits.size(); ++i) {
+ assert(static_cast<uint32_t>(k) <=
+ batch_edits.back()->remaining_entries_);
+ batch_edits[i]->remaining_entries_ -= static_cast<uint32_t>(k);
+ }
+ }
+ }
+ continue;
+ }
+ // We do a linear search on versions because versions is small.
+ // TODO(yanqin) maybe consider unordered_map
+ Version* version = nullptr;
+ VersionBuilder* builder = nullptr;
+ for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
+ uint32_t cf_id = last_writer->cfd->GetID();
+ if (versions[i]->cfd()->GetID() == cf_id) {
+ version = versions[i];
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ builder = builder_guards[i]->version_builder();
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
+ break;
+ }
+ }
+ if (version == nullptr) {
+ version = new Version(last_writer->cfd, this, file_options_,
+ last_writer->mutable_cf_options,
+ current_version_number_++);
+ versions.push_back(version);
+ mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options);
+ builder_guards.emplace_back(
+ new BaseReferencedVersionBuilder(last_writer->cfd));
+ builder = builder_guards.back()->version_builder();
+ }
+ assert(builder != nullptr); // make checker happy
+ for (const auto& e : last_writer->edit_list) {
+ if (e->is_in_atomic_group_) {
+ if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ ||
+ (batch_edits.back()->is_in_atomic_group_ &&
+ batch_edits.back()->remaining_entries_ == 0)) {
+ group_start = batch_edits.size();
+ }
+ } else if (group_start != std::numeric_limits<size_t>::max()) {
+ group_start = std::numeric_limits<size_t>::max();
+ }
+ Status s = LogAndApplyHelper(last_writer->cfd, builder, e, mu);
+ if (!s.ok()) {
+ // free up the allocated memory
+ for (auto v : versions) {
+ delete v;
+ }
+ return s;
+ }
+ batch_edits.push_back(e);
+ }
+ }
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ auto* builder = builder_guards[i]->version_builder();
+ Status s = builder->SaveTo(versions[i]->storage_info());
+ if (!s.ok()) {
+ // free up the allocated memory
+ for (auto v : versions) {
+ delete v;
+ }
+ return s;
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ // Verify that version edits of atomic groups have correct
+ // remaining_entries_.
+ size_t k = 0;
+ while (k < batch_edits.size()) {
+ while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) {
+ ++k;
+ }
+ if (k == batch_edits.size()) {
+ break;
+ }
+ size_t i = k;
+ while (i < batch_edits.size()) {
+ if (!batch_edits[i]->is_in_atomic_group_) {
+ break;
+ }
+ assert(i - k + batch_edits[i]->remaining_entries_ ==
+ batch_edits[k]->remaining_entries_);
+ if (batch_edits[i]->remaining_entries_ == 0) {
+ ++i;
+ break;
+ }
+ ++i;
+ }
+ assert(batch_edits[i - 1]->is_in_atomic_group_);
+ assert(0 == batch_edits[i - 1]->remaining_entries_);
+ std::vector<VersionEdit*> tmp;
+ for (size_t j = k; j != i; ++j) {
+ tmp.emplace_back(batch_edits[j]);
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
+ k = i;
+ }
+#endif // NDEBUG
+
+ uint64_t new_manifest_file_size = 0;
+ Status s;
+
+ assert(pending_manifest_file_number_ == 0);
+ if (!descriptor_log_ ||
+ manifest_file_size_ > db_options_->max_manifest_file_size) {
+ TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
+ new_descriptor_log = true;
+ } else {
+ pending_manifest_file_number_ = manifest_file_number_;
+ }
+
+ // Local cached copy of state variable(s). WriteCurrentStateToManifest()
+ // reads its content after releasing db mutex to avoid race with
+ // SwitchMemtable().
+ std::unordered_map<uint32_t, MutableCFState> curr_state;
+ if (new_descriptor_log) {
+ pending_manifest_file_number_ = NewFileNumber();
+ batch_edits.back()->SetNextFile(next_file_number_.load());
+
+ // if we are writing out new snapshot make sure to persist max column
+ // family.
+ if (column_family_set_->GetMaxColumnFamily() > 0) {
+ first_writer.edit_list.front()->SetMaxColumnFamily(
+ column_family_set_->GetMaxColumnFamily());
+ }
+ for (const auto* cfd : *column_family_set_) {
+ assert(curr_state.find(cfd->GetID()) == curr_state.end());
+ curr_state[cfd->GetID()] = {cfd->GetLogNumber()};
+ }
+ }
+
+ {
+ FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
+ mu->Unlock();
+
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest");
+ if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ assert(!mutable_cf_options_ptrs.empty() &&
+ builder_guards.size() == versions.size());
+ ColumnFamilyData* cfd = versions[i]->cfd_;
+ s = builder_guards[i]->version_builder()->LoadTableHandlers(
+ cfd->internal_stats(), cfd->ioptions()->optimize_filters_for_hits,
+ true /* prefetch_index_and_filter_in_cache */,
+ false /* is_initial_load */,
+ mutable_cf_options_ptrs[i]->prefix_extractor.get());
+ if (!s.ok()) {
+ if (db_options_->paranoid_checks) {
+ break;
+ }
+ s = Status::OK();
+ }
+ }
+ }
+
+ if (s.ok() && new_descriptor_log) {
+ // This is fine because everything inside of this block is serialized --
+ // only one thread can be here at the same time
+ // create new manifest file
+ ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
+ pending_manifest_file_number_);
+ std::string descriptor_fname =
+ DescriptorFileName(dbname_, pending_manifest_file_number_);
+ std::unique_ptr<FSWritableFile> descriptor_file;
+ s = NewWritableFile(fs_, descriptor_fname, &descriptor_file,
+ opt_file_opts);
+ if (s.ok()) {
+ descriptor_file->SetPreallocationBlockSize(
+ db_options_->manifest_preallocation_size);
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(descriptor_file), descriptor_fname, opt_file_opts, env_,
+ nullptr, db_options_->listeners));
+ descriptor_log_.reset(
+ new log::Writer(std::move(file_writer), 0, false));
+ s = WriteCurrentStateToManifest(curr_state, descriptor_log_.get());
+ }
+ }
+
+ if (s.ok()) {
+ if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ versions[i]->PrepareApply(*mutable_cf_options_ptrs[i], true);
+ }
+ }
+
+ // Write new records to MANIFEST log
+#ifndef NDEBUG
+ size_t idx = 0;
+#endif
+ for (auto& e : batch_edits) {
+ std::string record;
+ if (!e->EncodeTo(&record)) {
+ s = Status::Corruption("Unable to encode VersionEdit:" +
+ e->DebugString(true));
+ break;
+ }
+ TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord",
+ rocksdb_kill_odds * REDUCE_ODDS2);
+#ifndef NDEBUG
+ if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ nullptr);
+ TEST_SYNC_POINT(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+ }
+ ++idx;
+#endif /* !NDEBUG */
+ s = descriptor_log_->AddRecord(record);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ s = SyncManifest(env_, db_options_, descriptor_log_->file());
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ // If we just created a new descriptor file, install it by writing a
+ // new CURRENT file that points to it.
+ if (s.ok() && new_descriptor_log) {
+ s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_,
+ db_directory);
+ TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest");
+ }
+
+ if (s.ok()) {
+ // find offset in manifest file where this version is stored.
+ new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+ }
+
+ if (first_writer.edit_list.front()->is_column_family_drop_) {
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
+ }
+
+ LogFlush(db_options_->info_log);
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone");
+ mu->Lock();
+ }
+
+  // Append the old manifest file to the obsolete_manifests_ list to be
+  // deleted by PurgeObsoleteFiles later.
+ if (s.ok() && new_descriptor_log) {
+ obsolete_manifests_.emplace_back(
+ DescriptorFileName("", manifest_file_number_));
+ }
+
+ // Install the new versions
+ if (s.ok()) {
+ if (first_writer.edit_list.front()->is_column_family_add_) {
+ assert(batch_edits.size() == 1);
+ assert(new_cf_options != nullptr);
+ CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
+ } else if (first_writer.edit_list.front()->is_column_family_drop_) {
+ assert(batch_edits.size() == 1);
+ first_writer.cfd->SetDropped();
+ first_writer.cfd->UnrefAndTryDelete();
+ } else {
+ // Each version in versions corresponds to a column family.
+ // For each column family, update its log number indicating that logs
+ // with number smaller than this should be ignored.
+ for (const auto version : versions) {
+ uint64_t max_log_number_in_batch = 0;
+ uint32_t cf_id = version->cfd_->GetID();
+ for (const auto& e : batch_edits) {
+ if (e->has_log_number_ && e->column_family_ == cf_id) {
+ max_log_number_in_batch =
+ std::max(max_log_number_in_batch, e->log_number_);
+ }
+ }
+ if (max_log_number_in_batch != 0) {
+ assert(version->cfd_->GetLogNumber() <= max_log_number_in_batch);
+ version->cfd_->SetLogNumber(max_log_number_in_batch);
+ }
+ }
+
+ uint64_t last_min_log_number_to_keep = 0;
+ for (auto& e : batch_edits) {
+ if (e->has_min_log_number_to_keep_) {
+ last_min_log_number_to_keep =
+ std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_);
+ }
+ }
+
+ if (last_min_log_number_to_keep != 0) {
+ // Should only be set in 2PC mode.
+ MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep);
+ }
+
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ ColumnFamilyData* cfd = versions[i]->cfd_;
+ AppendVersion(cfd, versions[i]);
+ }
+ }
+ manifest_file_number_ = pending_manifest_file_number_;
+ manifest_file_size_ = new_manifest_file_size;
+ prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
+ } else {
+ std::string version_edits;
+ for (auto& e : batch_edits) {
+ version_edits += ("\n" + e->DebugString(true));
+ }
+ ROCKS_LOG_ERROR(db_options_->info_log,
+ "Error in committing version edit to MANIFEST: %s",
+ version_edits.c_str());
+ for (auto v : versions) {
+ delete v;
+ }
+ // If manifest append failed for whatever reason, the file could be
+ // corrupted. So we need to force the next version update to start a
+ // new manifest file.
+ descriptor_log_.reset();
+ if (new_descriptor_log) {
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Deleting manifest %" PRIu64 " current manifest %" PRIu64
+ "\n",
+ manifest_file_number_, pending_manifest_file_number_);
+ env_->DeleteFile(
+ DescriptorFileName(dbname_, pending_manifest_file_number_));
+ }
+ }
+
+ pending_manifest_file_number_ = 0;
+
+ // wake up all the waiting writers
+ while (true) {
+ ManifestWriter* ready = manifest_writers_.front();
+ manifest_writers_.pop_front();
+ bool need_signal = true;
+ for (const auto& w : writers) {
+ if (&w == ready) {
+ need_signal = false;
+ break;
+ }
+ }
+ ready->status = s;
+ ready->done = true;
+ if (need_signal) {
+ ready->cv.Signal();
+ }
+ if (ready == last_writer) {
+ break;
+ }
+ }
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+ return s;
+}
+
+// 'datas' is grammatically incorrect. We still use this notation to indicate
+// that this variable represents a collection of column_family_data.
+Status VersionSet::LogAndApply(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options) {
+ mu->AssertHeld();
+ int num_edits = 0;
+ for (const auto& elist : edit_lists) {
+ num_edits += static_cast<int>(elist.size());
+ }
+ if (num_edits == 0) {
+ return Status::OK();
+ } else if (num_edits > 1) {
+#ifndef NDEBUG
+ for (const auto& edit_list : edit_lists) {
+ for (const auto& edit : edit_list) {
+ assert(!edit->IsColumnFamilyManipulation());
+ }
+ }
+#endif /* ! NDEBUG */
+ }
+
+ int num_cfds = static_cast<int>(column_family_datas.size());
+ if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+ assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+ assert(edit_lists[0][0]->is_column_family_add_);
+ assert(new_cf_options != nullptr);
+ }
+ std::deque<ManifestWriter> writers;
+ if (num_cfds > 0) {
+ assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+ assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+ }
+ for (int i = 0; i < num_cfds; ++i) {
+ writers.emplace_back(mu, column_family_datas[i],
+ *mutable_cf_options_list[i], edit_lists[i]);
+ manifest_writers_.push_back(&writers[i]);
+ }
+ assert(!writers.empty());
+ ManifestWriter& first_writer = writers.front();
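+  // Writers are queued on manifest_writers_; the writer at the head performs
+  // the MANIFEST write for the whole group, and ProcessManifestWrites() marks
+  // the grouped writers done and signals them when it finishes.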
+ while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+ first_writer.cv.Wait();
+ }
+ if (first_writer.done) {
+ // All non-CF-manipulation operations can be grouped together and committed
+ // to MANIFEST. They should all have finished. The status code is stored in
+ // the first manifest writer.
+#ifndef NDEBUG
+ for (const auto& writer : writers) {
+ assert(writer.done);
+ }
+#endif /* !NDEBUG */
+ return first_writer.status;
+ }
+
+ int num_undropped_cfds = 0;
+ for (auto cfd : column_family_datas) {
+ // if cfd == nullptr, it is a column family add.
+ if (cfd == nullptr || !cfd->IsDropped()) {
+ ++num_undropped_cfds;
+ }
+ }
+ if (0 == num_undropped_cfds) {
+ for (int i = 0; i != num_cfds; ++i) {
+ manifest_writers_.pop_front();
+ }
+ // Notify new head of manifest write queue.
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+ return Status::ColumnFamilyDropped();
+ }
+
+ return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log,
+ new_cf_options);
+}
+
+void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
+ assert(edit->IsColumnFamilyManipulation());
+ edit->SetNextFile(next_file_number_.load());
+  // The log might have data that is not visible to the memtable and hence has
+  // not updated the last_sequence_ yet. It is also possible that the log is
+  // expecting some new data that is not written yet. Since LastSequence is an
+  // upper bound on the sequence, it is ok to record
+  // last_allocated_sequence_ as the last sequence.
+ edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
+ : last_sequence_);
+ if (edit->is_column_family_drop_) {
+    // If we drop a column family, we have to make sure to save the max
+    // column family, so that we don't reuse an existing ID.
+ edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+ }
+}
+
+Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
+ VersionBuilder* builder, VersionEdit* edit,
+ InstrumentedMutex* mu) {
+#ifdef NDEBUG
+ (void)cfd;
+#endif
+ mu->AssertHeld();
+ assert(!edit->IsColumnFamilyManipulation());
+
+ if (edit->has_log_number_) {
+ assert(edit->log_number_ >= cfd->GetLogNumber());
+ assert(edit->log_number_ < next_file_number_.load());
+ }
+
+ if (!edit->has_prev_log_number_) {
+ edit->SetPrevLogNumber(prev_log_number_);
+ }
+ edit->SetNextFile(next_file_number_.load());
+  // The log might have data that is not visible to the memtable and hence has
+  // not updated the last_sequence_ yet. It is also possible that the log is
+  // expecting some new data that is not written yet. Since LastSequence is an
+  // upper bound on the sequence, it is ok to record
+  // last_allocated_sequence_ as the last sequence.
+ edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
+ : last_sequence_);
+
+ Status s = builder->Apply(edit);
+
+ return s;
+}
+
+Status VersionSet::ApplyOneVersionEditToBuilder(
+ VersionEdit& edit,
+ const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_options,
+ std::unordered_map<int, std::string>& column_families_not_found,
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>&
+ builders,
+ VersionEditParams* version_edit_params) {
+  // Not found means that the user didn't supply that column
+  // family option AND we encountered a column family add
+  // record. Once we encounter a column family drop record,
+ // we will delete the column family from
+ // column_families_not_found.
+ bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) !=
+ column_families_not_found.end());
+  // Being in `builders` means that the user supplied that column family
+  // option AND that we encountered a column family add record.
+ bool cf_in_builders = builders.find(edit.column_family_) != builders.end();
+
+ // they can't both be true
+ assert(!(cf_in_not_found && cf_in_builders));
+
+ ColumnFamilyData* cfd = nullptr;
+
+ if (edit.is_column_family_add_) {
+ if (cf_in_builders || cf_in_not_found) {
+ return Status::Corruption(
+ "Manifest adding the same column family twice: " +
+ edit.column_family_name_);
+ }
+ auto cf_options = name_to_options.find(edit.column_family_name_);
+ // implicitly add persistent_stats column family without requiring user
+ // to specify
+ bool is_persistent_stats_column_family =
+ edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
+ if (cf_options == name_to_options.end() &&
+ !is_persistent_stats_column_family) {
+ column_families_not_found.insert(
+ {edit.column_family_, edit.column_family_name_});
+ } else {
+ // recover persistent_stats CF from a DB that already contains it
+ if (is_persistent_stats_column_family) {
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ cfd = CreateColumnFamily(cfo, &edit);
+ } else {
+ cfd = CreateColumnFamily(cf_options->second, &edit);
+ }
+ cfd->set_initialized();
+ builders.insert(std::make_pair(
+ edit.column_family_, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(cfd))));
+ }
+ } else if (edit.is_column_family_drop_) {
+ if (cf_in_builders) {
+ auto builder = builders.find(edit.column_family_);
+ assert(builder != builders.end());
+ builders.erase(builder);
+ cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+ assert(cfd != nullptr);
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ } else {
+ // who else can have reference to cfd!?
+ assert(false);
+ }
+ } else if (cf_in_not_found) {
+ column_families_not_found.erase(edit.column_family_);
+ } else {
+ return Status::Corruption(
+ "Manifest - dropping non-existing column family");
+ }
+ } else if (!cf_in_not_found) {
+ if (!cf_in_builders) {
+ return Status::Corruption(
+ "Manifest record referencing unknown column family");
+ }
+
+ cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+ // this should never happen since cf_in_builders is true
+ assert(cfd != nullptr);
+
+ // if it is not column family add or column family drop,
+ // then it's a file add/delete, which should be forwarded
+ // to builder
+ auto builder = builders.find(edit.column_family_);
+ assert(builder != builders.end());
+ Status s = builder->second->version_builder()->Apply(&edit);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params);
+}
+
+Status VersionSet::ExtractInfoFromVersionEdit(
+ ColumnFamilyData* cfd, const VersionEdit& from_edit,
+ VersionEditParams* version_edit_params) {
+ if (cfd != nullptr) {
+ if (from_edit.has_db_id_) {
+ version_edit_params->SetDBId(from_edit.db_id_);
+ }
+ if (from_edit.has_log_number_) {
+ if (cfd->GetLogNumber() > from_edit.log_number_) {
+ ROCKS_LOG_WARN(
+ db_options_->info_log,
+ "MANIFEST corruption detected, but ignored - Log numbers in "
+ "records NOT monotonically increasing");
+ } else {
+ cfd->SetLogNumber(from_edit.log_number_);
+ version_edit_params->SetLogNumber(from_edit.log_number_);
+ }
+ }
+ if (from_edit.has_comparator_ &&
+ from_edit.comparator_ != cfd->user_comparator()->Name()) {
+ return Status::InvalidArgument(
+ cfd->user_comparator()->Name(),
+ "does not match existing comparator " + from_edit.comparator_);
+ }
+ }
+
+ if (from_edit.has_prev_log_number_) {
+ version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_);
+ }
+
+ if (from_edit.has_next_file_number_) {
+ version_edit_params->SetNextFile(from_edit.next_file_number_);
+ }
+
+ if (from_edit.has_max_column_family_) {
+ version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_);
+ }
+
+ if (from_edit.has_min_log_number_to_keep_) {
+ version_edit_params->min_log_number_to_keep_ =
+ std::max(version_edit_params->min_log_number_to_keep_,
+ from_edit.min_log_number_to_keep_);
+ }
+
+ if (from_edit.has_last_sequence_) {
+ version_edit_params->SetLastSequence(from_edit.last_sequence_);
+ }
+ return Status::OK();
+}
+
+Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
+ FileSystem* fs,
+ std::string* manifest_path,
+ uint64_t* manifest_file_number) {
+ assert(fs != nullptr);
+ assert(manifest_path != nullptr);
+ assert(manifest_file_number != nullptr);
+
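+  // The CURRENT file is expected to hold the bare name of the active manifest
+  // followed by a trailing newline, e.g. "MANIFEST-000001\n" (the file number
+  // shown here is illustrative only).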
+ std::string fname;
+ Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname);
+ if (!s.ok()) {
+ return s;
+ }
+ if (fname.empty() || fname.back() != '\n') {
+ return Status::Corruption("CURRENT file does not end with newline");
+ }
+ // remove the trailing '\n'
+ fname.resize(fname.size() - 1);
+ FileType type;
+ bool parse_ok = ParseFileName(fname, manifest_file_number, &type);
+ if (!parse_ok || type != kDescriptorFile) {
+ return Status::Corruption("CURRENT file corrupted");
+ }
+ *manifest_path = dbname;
+ if (dbname.back() != '/') {
+ manifest_path->push_back('/');
+ }
+ *manifest_path += fname;
+ return Status::OK();
+}
+
+Status VersionSet::ReadAndRecover(
+ log::Reader* reader, AtomicGroupReadBuffer* read_buffer,
+ const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_options,
+ std::unordered_map<int, std::string>& column_families_not_found,
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>&
+ builders,
+ VersionEditParams* version_edit_params, std::string* db_id) {
+ assert(reader != nullptr);
+ assert(read_buffer != nullptr);
+ Status s;
+ Slice record;
+ std::string scratch;
+ size_t recovered_edits = 0;
+ while (reader->ReadRecord(&record, &scratch) && s.ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+ if (edit.has_db_id_) {
+ db_id_ = edit.GetDbId();
+ if (db_id != nullptr) {
+ db_id->assign(edit.GetDbId());
+ }
+ }
+ s = read_buffer->AddEdit(&edit);
+ if (!s.ok()) {
+ break;
+ }
+ if (edit.is_in_atomic_group_) {
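+      // If the group is not yet complete, the edit just read stays buffered in
+      // *read_buffer; a later decode/apply failure clears the buffer before
+      // this function returns.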
+ if (read_buffer->IsFull()) {
+ // Apply edits in an atomic group when we have read all edits in the
+ // group.
+ for (auto& e : read_buffer->replay_buffer()) {
+ s = ApplyOneVersionEditToBuilder(e, name_to_options,
+ column_families_not_found, builders,
+ version_edit_params);
+ if (!s.ok()) {
+ break;
+ }
+ recovered_edits++;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ read_buffer->Clear();
+ }
+ } else {
+ // Apply a normal edit immediately.
+ s = ApplyOneVersionEditToBuilder(edit, name_to_options,
+ column_families_not_found, builders,
+ version_edit_params);
+ if (s.ok()) {
+ recovered_edits++;
+ }
+ }
+ }
+ if (!s.ok()) {
+ // Clear the buffer if we fail to decode/apply an edit.
+ read_buffer->Clear();
+ }
+ TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits",
+ &recovered_edits);
+ return s;
+}
+
+Status VersionSet::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ std::string* db_id) {
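+  // Recovery, in outline (descriptive summary of the steps below): resolve the
+  // active MANIFEST via the CURRENT file, replay every VersionEdit into
+  // per-column-family version builders, validate that the required fields
+  // (next file number, log number, last sequence) were seen, and finally
+  // materialize and install a Version for each column family.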
+ std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_options;
+ for (const auto& cf : column_families) {
+ cf_name_to_options.emplace(cf.name, cf.options);
+ }
+  // Keeps track of column families in the manifest that were not found in the
+  // column_families parameter. If those column families are not dropped by
+  // subsequent manifest records, Recover() will return a failure status.
+ std::unordered_map<int, std::string> column_families_not_found;
+
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string manifest_path;
+ Status s = GetCurrentManifestPath(dbname_, fs_, &manifest_path,
+ &manifest_file_number_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n",
+ manifest_path.c_str());
+
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ s = fs_->NewSequentialFile(manifest_path,
+ fs_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ manifest_file_reader.reset(
+ new SequentialFileReader(std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size));
+ }
+
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+ builders;
+
+ // add default column family
+ auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
+ if (default_cf_iter == cf_name_to_options.end()) {
+ return Status::InvalidArgument("Default column family not specified");
+ }
+ VersionEdit default_cf_edit;
+ default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+ default_cf_edit.SetColumnFamily(0);
+ ColumnFamilyData* default_cfd =
+ CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
+ // In recovery, nobody else can access it, so it's fine to set it to be
+ // initialized earlier.
+ default_cfd->set_initialized();
+ builders.insert(
+ std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(default_cfd))));
+ uint64_t current_manifest_file_size = 0;
+ VersionEditParams version_edit_params;
+ {
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ Slice record;
+ std::string scratch;
+ AtomicGroupReadBuffer read_buffer;
+ s = ReadAndRecover(&reader, &read_buffer, cf_name_to_options,
+ column_families_not_found, builders,
+ &version_edit_params, db_id);
+ current_manifest_file_size = reader.GetReadOffset();
+ assert(current_manifest_file_size != 0);
+ }
+
+ if (s.ok()) {
+ if (!version_edit_params.has_next_file_number_) {
+ s = Status::Corruption("no meta-nextfile entry in descriptor");
+ } else if (!version_edit_params.has_log_number_) {
+ s = Status::Corruption("no meta-lognumber entry in descriptor");
+ } else if (!version_edit_params.has_last_sequence_) {
+ s = Status::Corruption("no last-sequence-number entry in descriptor");
+ }
+
+ if (!version_edit_params.has_prev_log_number_) {
+ version_edit_params.SetPrevLogNumber(0);
+ }
+
+ column_family_set_->UpdateMaxColumnFamily(
+ version_edit_params.max_column_family_);
+
+    // When reading a DB generated by an old release, min_log_number_to_keep=0.
+    // All log files will be scanned for potential prepare entries.
+ MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_);
+ MarkFileNumberUsed(version_edit_params.prev_log_number_);
+ MarkFileNumberUsed(version_edit_params.log_number_);
+ }
+
+  // There were some column families in the MANIFEST that weren't specified
+  // in the argument. This is OK only in read_only mode.
+ if (read_only == false && !column_families_not_found.empty()) {
+ std::string list_of_not_found;
+ for (const auto& cf : column_families_not_found) {
+ list_of_not_found += ", " + cf.second;
+ }
+ list_of_not_found = list_of_not_found.substr(2);
+ s = Status::InvalidArgument(
+ "You have to open all column families. Column families not opened: " +
+ list_of_not_found);
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ assert(builders.count(cfd->GetID()) > 0);
+ auto* builder = builders[cfd->GetID()]->version_builder();
+ if (!builder->CheckConsistencyForNumLevels()) {
+ s = Status::InvalidArgument(
+ "db has more levels than options.num_levels");
+ break;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (read_only) {
+ cfd->table_cache()->SetTablesAreImmortal();
+ }
+ assert(cfd->initialized());
+ auto builders_iter = builders.find(cfd->GetID());
+ assert(builders_iter != builders.end());
+ auto builder = builders_iter->second->version_builder();
+
+      // We have an unlimited table cache, so pre-load the table handles now.
+      // This needs to be done outside of the mutex.
+ s = builder->LoadTableHandlers(
+ cfd->internal_stats(), db_options_->max_file_opening_threads,
+ false /* prefetch_index_and_filter_in_cache */,
+ true /* is_initial_load */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
+ if (!s.ok()) {
+ if (db_options_->paranoid_checks) {
+ return s;
+ }
+ s = Status::OK();
+ }
+
+ Version* v = new Version(cfd, this, file_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+ builder->SaveTo(v->storage_info());
+
+ // Install recovered version
+ v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
+ !(db_options_->skip_stats_update_on_db_open));
+ AppendVersion(cfd, v);
+ }
+
+ manifest_file_size_ = current_manifest_file_size;
+ next_file_number_.store(version_edit_params.next_file_number_ + 1);
+ last_allocated_sequence_ = version_edit_params.last_sequence_;
+ last_published_sequence_ = version_edit_params.last_sequence_;
+ last_sequence_ = version_edit_params.last_sequence_;
+ prev_log_number_ = version_edit_params.prev_log_number_;
+
+ ROCKS_LOG_INFO(
+ db_options_->info_log,
+ "Recovered from manifest file:%s succeeded,"
+ "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64
+ ", last_sequence is %" PRIu64 ", log_number is %" PRIu64
+ ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
+ ",min_log_number_to_keep is %" PRIu64 "\n",
+ manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
+ last_sequence_.load(), version_edit_params.log_number_,
+ prev_log_number_, column_family_set_->GetMaxColumnFamily(),
+ min_log_number_to_keep_2pc());
+
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Column family [%s] (ID %" PRIu32
+ "), log number is %" PRIu64 "\n",
+ cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+ }
+ }
+
+ return s;
+}
+
+Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
+ const std::string& dbname,
+ FileSystem* fs) {
+  // these are just for performance reasons, not correctness,
+  // so we're fine using the defaults
+ FileOptions soptions;
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string manifest_path;
+ uint64_t manifest_file_number;
+ Status s =
+ GetCurrentManifestPath(dbname, fs, &manifest_path, &manifest_file_number);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ s = fs->NewSequentialFile(manifest_path, soptions, &file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file_reader.reset(new SequentialFileReader(std::move(file), manifest_path));
+ }
+
+ std::map<uint32_t, std::string> column_family_names;
+ // default column family is always implicitly there
+ column_family_names.insert({0, kDefaultColumnFamilyName});
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+ if (edit.is_column_family_add_) {
+ if (column_family_names.find(edit.column_family_) !=
+ column_family_names.end()) {
+ s = Status::Corruption("Manifest adding the same column family twice");
+ break;
+ }
+ column_family_names.insert(
+ {edit.column_family_, edit.column_family_name_});
+ } else if (edit.is_column_family_drop_) {
+ if (column_family_names.find(edit.column_family_) ==
+ column_family_names.end()) {
+ s = Status::Corruption(
+ "Manifest - dropping non-existing column family");
+ break;
+ }
+ column_family_names.erase(edit.column_family_);
+ }
+ }
+
+ column_families->clear();
+ if (s.ok()) {
+ for (const auto& iter : column_family_names) {
+ column_families->push_back(iter.second);
+ }
+ }
+
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
+ const Options* options,
+ const FileOptions& file_options,
+ int new_levels) {
+ if (new_levels <= 1) {
+ return Status::InvalidArgument(
+ "Number of levels needs to be bigger than 1");
+ }
+
+ ImmutableDBOptions db_options(*options);
+ ColumnFamilyOptions cf_options(*options);
+ std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
+ options->table_cache_numshardbits));
+ WriteController wc(options->delayed_write_rate);
+ WriteBufferManager wb(options->db_write_buffer_size);
+ VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
+ /*block_cache_tracer=*/nullptr);
+ Status status;
+
+ std::vector<ColumnFamilyDescriptor> dummy;
+ ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(*options));
+ dummy.push_back(dummy_descriptor);
+ status = versions.Recover(dummy);
+ if (!status.ok()) {
+ return status;
+ }
+
+ Version* current_version =
+ versions.GetColumnFamilySet()->GetDefault()->current();
+ auto* vstorage = current_version->storage_info();
+ int current_levels = vstorage->num_levels();
+
+ if (current_levels <= new_levels) {
+ return Status::OK();
+ }
+
+  // Make sure there are files on only one level from
+  // (new_levels-1) to (current_levels-1)
+ int first_nonempty_level = -1;
+ int first_nonempty_level_filenum = 0;
+ for (int i = new_levels - 1; i < current_levels; i++) {
+ int file_num = vstorage->NumLevelFiles(i);
+ if (file_num != 0) {
+ if (first_nonempty_level < 0) {
+ first_nonempty_level = i;
+ first_nonempty_level_filenum = file_num;
+ } else {
+ char msg[255];
+ snprintf(msg, sizeof(msg),
+ "Found at least two levels containing files: "
+ "[%d:%d],[%d:%d].\n",
+ first_nonempty_level, first_nonempty_level_filenum, i,
+ file_num);
+ return Status::InvalidArgument(msg);
+ }
+ }
+ }
+
+  // We need to allocate an array sized for the old number of levels to
+  // avoid a SIGSEGV in WriteCurrentStateToManifest();
+  // however, all levels greater than or equal to new_levels will be empty.
+ std::vector<FileMetaData*>* new_files_list =
+ new std::vector<FileMetaData*>[current_levels];
+ for (int i = 0; i < new_levels - 1; i++) {
+ new_files_list[i] = vstorage->LevelFiles(i);
+ }
+
+ if (first_nonempty_level > 0) {
+ new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level);
+ }
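+  // Illustrative example: with current_levels == 7 and new_levels == 3, the
+  // single non-empty level among L2..L6 (verified above) ends up as the new
+  // L2 (new_levels - 1), while L0 and L1 are carried over unchanged.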
+
+  delete[] vstorage->files_;
+ vstorage->files_ = new_files_list;
+ vstorage->num_levels_ = new_levels;
+
+ MutableCFOptions mutable_cf_options(*options);
+ VersionEdit ve;
+ InstrumentedMutex dummy_mutex;
+ InstrumentedMutexLock l(&dummy_mutex);
+ return versions.LogAndApply(
+ versions.GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options, &ve, &dummy_mutex, nullptr, true);
+}
+
+// Get the checksum information including the checksum and checksum function
+// name of all SST files in VersionSet. Store the information in
+// FileChecksumList which contains a map from file number to its checksum info.
+// If the DB is not running, make sure to call VersionSet::Recover() to load
+// the file metadata from the MANIFEST into the VersionSet before calling this
+// function.
+Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+ // Clean the previously stored checksum information if any.
+ if (checksum_list == nullptr) {
+ return Status::InvalidArgument("checksum_list is nullptr");
+ }
+ checksum_list->reset();
+
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ for (const auto& file :
+ cfd->current()->storage_info()->LevelFiles(level)) {
+ checksum_list->InsertOneFileChecksum(file->fd.GetNumber(),
+ file->file_checksum,
+ file->file_checksum_func_name);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+Status VersionSet::DumpManifest(Options& options, std::string& dscname,
+ bool verbose, bool hex, bool json) {
+ // Open the specified manifest file.
+ std::unique_ptr<SequentialFileReader> file_reader;
+ Status s;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ s = options.file_system->NewSequentialFile(
+ dscname,
+ options.file_system->OptimizeForManifestRead(file_options_), &file,
+ nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), dscname, db_options_->log_readahead_size));
+ }
+
+ bool have_prev_log_number = false;
+ bool have_next_file = false;
+ bool have_last_sequence = false;
+ uint64_t next_file = 0;
+ uint64_t last_sequence = 0;
+ uint64_t previous_log_number = 0;
+ int count = 0;
+ std::unordered_map<uint32_t, std::string> comparators;
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+ builders;
+
+ // add default column family
+ VersionEdit default_cf_edit;
+ default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+ default_cf_edit.SetColumnFamily(0);
+ ColumnFamilyData* default_cfd =
+ CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit);
+ builders.insert(
+ std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(default_cfd))));
+
+ {
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+
+ // Write out each individual edit
+ if (verbose && !json) {
+ printf("%s\n", edit.DebugString(hex).c_str());
+ } else if (json) {
+ printf("%s\n", edit.DebugJSON(count, hex).c_str());
+ }
+ count++;
+
+ bool cf_in_builders =
+ builders.find(edit.column_family_) != builders.end();
+
+ if (edit.has_comparator_) {
+ comparators.insert({edit.column_family_, edit.comparator_});
+ }
+
+ ColumnFamilyData* cfd = nullptr;
+
+ if (edit.is_column_family_add_) {
+ if (cf_in_builders) {
+ s = Status::Corruption(
+ "Manifest adding the same column family twice");
+ break;
+ }
+ cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit);
+ cfd->set_initialized();
+ builders.insert(std::make_pair(
+ edit.column_family_, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(cfd))));
+ } else if (edit.is_column_family_drop_) {
+ if (!cf_in_builders) {
+ s = Status::Corruption(
+ "Manifest - dropping non-existing column family");
+ break;
+ }
+ auto builder_iter = builders.find(edit.column_family_);
+ builders.erase(builder_iter);
+ comparators.erase(edit.column_family_);
+ cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+ assert(cfd != nullptr);
+ cfd->UnrefAndTryDelete();
+ cfd = nullptr;
+ } else {
+ if (!cf_in_builders) {
+ s = Status::Corruption(
+ "Manifest record referencing unknown column family");
+ break;
+ }
+
+ cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+ // this should never happen since cf_in_builders is true
+ assert(cfd != nullptr);
+
+ // if it is not column family add or column family drop,
+ // then it's a file add/delete, which should be forwarded
+ // to builder
+ auto builder = builders.find(edit.column_family_);
+ assert(builder != builders.end());
+ s = builder->second->version_builder()->Apply(&edit);
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ if (cfd != nullptr && edit.has_log_number_) {
+ cfd->SetLogNumber(edit.log_number_);
+ }
+
+ if (edit.has_prev_log_number_) {
+ previous_log_number = edit.prev_log_number_;
+ have_prev_log_number = true;
+ }
+
+ if (edit.has_next_file_number_) {
+ next_file = edit.next_file_number_;
+ have_next_file = true;
+ }
+
+ if (edit.has_last_sequence_) {
+ last_sequence = edit.last_sequence_;
+ have_last_sequence = true;
+ }
+
+ if (edit.has_max_column_family_) {
+ column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_);
+ }
+
+ if (edit.has_min_log_number_to_keep_) {
+ MarkMinLogNumberToKeep2PC(edit.min_log_number_to_keep_);
+ }
+ }
+ }
+ file_reader.reset();
+
+ if (s.ok()) {
+ if (!have_next_file) {
+ s = Status::Corruption("no meta-nextfile entry in descriptor");
+ printf("no meta-nextfile entry in descriptor");
+ } else if (!have_last_sequence) {
+ printf("no last-sequence-number entry in descriptor");
+ s = Status::Corruption("no last-sequence-number entry in descriptor");
+ }
+
+ if (!have_prev_log_number) {
+ previous_log_number = 0;
+ }
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ auto builders_iter = builders.find(cfd->GetID());
+ assert(builders_iter != builders.end());
+ auto builder = builders_iter->second->version_builder();
+
+ Version* v = new Version(cfd, this, file_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+ builder->SaveTo(v->storage_info());
+ v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false);
+
+ printf("--------------- Column family \"%s\" (ID %" PRIu32
+ ") --------------\n",
+ cfd->GetName().c_str(), cfd->GetID());
+ printf("log number: %" PRIu64 "\n", cfd->GetLogNumber());
+ auto comparator = comparators.find(cfd->GetID());
+ if (comparator != comparators.end()) {
+ printf("comparator: %s\n", comparator->second.c_str());
+ } else {
+ printf("comparator: <NO COMPARATOR>\n");
+ }
+ printf("%s \n", v->DebugString(hex).c_str());
+ delete v;
+ }
+
+ next_file_number_.store(next_file + 1);
+ last_allocated_sequence_ = last_sequence;
+ last_published_sequence_ = last_sequence;
+ last_sequence_ = last_sequence;
+ prev_log_number_ = previous_log_number;
+
+ printf("next_file_number %" PRIu64 " last_sequence %" PRIu64
+ " prev_log_number %" PRIu64 " max_column_family %" PRIu32
+ " min_log_number_to_keep "
+ "%" PRIu64 "\n",
+ next_file_number_.load(), last_sequence, previous_log_number,
+ column_family_set_->GetMaxColumnFamily(),
+ min_log_number_to_keep_2pc());
+ }
+
+ return s;
+}
+#endif // ROCKSDB_LITE
+
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+ // only called during recovery and repair which are single threaded, so this
+ // works because there can't be concurrent calls
+ if (next_file_number_.load(std::memory_order_relaxed) <= number) {
+ next_file_number_.store(number + 1, std::memory_order_relaxed);
+ }
+}
+// Called only from ::LogAndApply, which is protected by a mutex, or during
+// recovery, which is single-threaded.
+void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) {
+ if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) {
+ min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed);
+ }
+}
+
+Status VersionSet::WriteCurrentStateToManifest(
+ const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+ log::Writer* log) {
+ // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+ // WARNING: This method doesn't hold a mutex!!
+
+ // This is done without DB mutex lock held, but only within single-threaded
+ // LogAndApply. Column family manipulations can only happen within LogAndApply
+ // (the same single thread), so we're safe to iterate.
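+  //
+  // Record layout written below (per the code that follows): an optional
+  // VersionEdit carrying the DB id, then, for every live column family, one
+  // edit naming the family (omitted for the default family) and its
+  // comparator, followed by one edit listing all of its live files and its
+  // current log number.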
+
+ if (db_options_->write_dbid_to_manifest) {
+ VersionEdit edit_for_db_id;
+ assert(!db_id_.empty());
+ edit_for_db_id.SetDBId(db_id_);
+ std::string db_id_record;
+ if (!edit_for_db_id.EncodeTo(&db_id_record)) {
+ return Status::Corruption("Unable to Encode VersionEdit:" +
+ edit_for_db_id.DebugString(true));
+ }
+ Status add_record = log->AddRecord(db_id_record);
+ if (!add_record.ok()) {
+ return add_record;
+ }
+ }
+
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ {
+ // Store column family info
+ VersionEdit edit;
+ if (cfd->GetID() != 0) {
+ // default column family is always there,
+ // no need to explicitly write it
+ edit.AddColumnFamily(cfd->GetName());
+ edit.SetColumnFamily(cfd->GetID());
+ }
+ edit.SetComparatorName(
+ cfd->internal_comparator().user_comparator()->Name());
+ std::string record;
+ if (!edit.EncodeTo(&record)) {
+ return Status::Corruption(
+ "Unable to Encode VersionEdit:" + edit.DebugString(true));
+ }
+ Status s = log->AddRecord(record);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ // Save files
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ for (const auto& f :
+ cfd->current()->storage_info()->LevelFiles(level)) {
+ edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time,
+ f->file_checksum, f->file_checksum_func_name);
+ }
+ }
+ const auto iter = curr_state.find(cfd->GetID());
+ assert(iter != curr_state.end());
+ uint64_t log_number = iter->second.log_number;
+ edit.SetLogNumber(log_number);
+ std::string record;
+ if (!edit.EncodeTo(&record)) {
+ return Status::Corruption(
+ "Unable to Encode VersionEdit:" + edit.DebugString(true));
+ }
+ Status s = log->AddRecord(record);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ return Status::OK();
+}
+
+// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
+// function is called repeatedly with consecutive pairs of slices. For example
+// if the slice list is [a, b, c, d] this function is called with arguments
+// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
+// we avoid doing binary search for the keys b and c twice and instead somehow
+// maintain state of where they first appear in the files.
+uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
+ Version* v, const Slice& start,
+ const Slice& end, int start_level,
+ int end_level, TableReaderCaller caller) {
+ const auto& icmp = v->cfd_->internal_comparator();
+
+ // pre-condition
+ assert(icmp.Compare(start, end) <= 0);
+
+ uint64_t total_full_size = 0;
+ const auto* vstorage = v->storage_info();
+ const int num_non_empty_levels = vstorage->num_non_empty_levels();
+ end_level = (end_level == -1) ? num_non_empty_levels
+ : std::min(end_level, num_non_empty_levels);
+
+ assert(start_level <= end_level);
+
+  // Outline of the optimization that uses options.files_size_error_margin.
+  // When approximating the total size of the files used to store a key range,
+  // we first sum up the sizes of the files that fully fall into the range.
+  // Then we sum up the sizes of all the files that may intersect with the range
+  // (this includes all files in L0 as well). Then, if total_intersecting_size
+  // is smaller than total_full_size * options.files_size_error_margin, we can
+  // infer that the intersecting files have a sufficiently negligible
+  // contribution to the total size, and we can approximate the storage required
+  // for the keys in the range as just half of the intersecting_files_size.
+  // E.g., if the value of files_size_error_margin is 0.1, then the error of the
+  // approximation is limited to only ~10% of the total size of the files that
+  // fully fall into the key range. In such a case, this helps to avoid a costly
+  // process of binary searching the intersecting files, which is required only
+  // for a more precise calculation of the total size.
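+  //
+  // Illustrative (hypothetical) numbers: with total_full_size = 900 MB,
+  // total_intersecting_size = 60 MB and files_size_error_margin = 0.1, we have
+  // 60 < 900 * 0.1, so the code below simply adds 60 / 2 = 30 MB instead of
+  // binary searching inside the boundary files.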
+
+ autovector<FdWithKeyRange*, 32> first_files;
+ autovector<FdWithKeyRange*, 16> last_files;
+
+ // scan all the levels
+ for (int level = start_level; level < end_level; ++level) {
+ const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
+ if (files_brief.num_files == 0) {
+ // empty level, skip exploration
+ continue;
+ }
+
+ if (level == 0) {
+      // Level-0 files are not in sorted order, so we need to iterate through
+      // the list to compute the total bytes that require scanning; handle
+      // this case explicitly (similarly to the first_files case).
+ for (size_t i = 0; i < files_brief.num_files; i++) {
+ first_files.push_back(&files_brief.files[i]);
+ }
+ continue;
+ }
+
+ assert(level > 0);
+ assert(files_brief.num_files > 0);
+
+ // identify the file position for start key
+ const int idx_start =
+ FindFileInRange(icmp, files_brief, start, 0,
+ static_cast<uint32_t>(files_brief.num_files - 1));
+ assert(static_cast<size_t>(idx_start) < files_brief.num_files);
+
+ // identify the file position for end key
+ int idx_end = idx_start;
+ if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
+ idx_end =
+ FindFileInRange(icmp, files_brief, end, idx_start,
+ static_cast<uint32_t>(files_brief.num_files - 1));
+ }
+ assert(idx_end >= idx_start &&
+ static_cast<size_t>(idx_end) < files_brief.num_files);
+
+ // scan all files from the starting index to the ending index
+ // (inferred from the sorted order)
+
+ // first scan all the intermediate full files (excluding first and last)
+ for (int i = idx_start + 1; i < idx_end; ++i) {
+ uint64_t file_size = files_brief.files[i].fd.GetFileSize();
+ // The entire file falls into the range, so we can just take its size.
+ assert(file_size ==
+ ApproximateSize(v, files_brief.files[i], start, end, caller));
+ total_full_size += file_size;
+ }
+
+ // save the first and the last files (which may be the same file), so we
+ // can scan them later.
+ first_files.push_back(&files_brief.files[idx_start]);
+ if (idx_start != idx_end) {
+ // we need to estimate size for both files, only if they are different
+ last_files.push_back(&files_brief.files[idx_end]);
+ }
+ }
+
+ // The sum of all file sizes that intersect the [start, end] keys range.
+ uint64_t total_intersecting_size = 0;
+ for (const auto* file_ptr : first_files) {
+ total_intersecting_size += file_ptr->fd.GetFileSize();
+ }
+ for (const auto* file_ptr : last_files) {
+ total_intersecting_size += file_ptr->fd.GetFileSize();
+ }
+
+ // Now scan all the first & last files at each level, and estimate their size.
+  // If total_intersecting_size is less than X% of total_full_size, we want to
+  // approximate the result in order to avoid the costly binary search inside
+  // ApproximateSize. We use half of the file size as the approximation below.
+
+ const double margin = options.files_size_error_margin;
+ if (margin > 0 && total_intersecting_size <
+ static_cast<uint64_t>(total_full_size * margin)) {
+ total_full_size += total_intersecting_size / 2;
+ } else {
+ // Estimate for all the first files, at each level
+ for (const auto file_ptr : first_files) {
+ total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
+ }
+
+ // Estimate for all the last files, at each level
+ for (const auto file_ptr : last_files) {
+ // We could use ApproximateSize here, but calling ApproximateOffsetOf
+ // directly is just more efficient.
+ total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller);
+ }
+ }
+
+ return total_full_size;
+}
+
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+ const Slice& key,
+ TableReaderCaller caller) {
+ // pre-condition
+ assert(v);
+ const auto& icmp = v->cfd_->internal_comparator();
+
+ uint64_t result = 0;
+ if (icmp.Compare(f.largest_key, key) <= 0) {
+ // Entire file is before "key", so just add the file size
+ result = f.fd.GetFileSize();
+ } else if (icmp.Compare(f.smallest_key, key) > 0) {
+ // Entire file is after "key", so ignore
+ result = 0;
+ } else {
+ // "key" falls in the range for this table. Add the
+ // approximate offset of "key" within the table.
+ TableCache* table_cache = v->cfd_->table_cache();
+ if (table_cache != nullptr) {
+ result = table_cache->ApproximateOffsetOf(
+ key, f.file_metadata->fd, caller, icmp,
+ v->GetMutableCFOptions().prefix_extractor.get());
+ }
+ }
+ return result;
+}
+
+uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
+ const Slice& start, const Slice& end,
+ TableReaderCaller caller) {
+ // pre-condition
+ assert(v);
+ const auto& icmp = v->cfd_->internal_comparator();
+ assert(icmp.Compare(start, end) <= 0);
+
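+  // Case analysis performed below: a file entirely outside [start, end]
+  // contributes 0; a file that begins at or after "start" contributes the
+  // approximate offset of "end" within it (its whole size if it also ends
+  // before "end"); a file that ends before "end" but begins before "start"
+  // contributes its size minus the offset of "start"; otherwise [start, end]
+  // lies entirely inside the file and the table's ApproximateSize() is used.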
+ if (icmp.Compare(f.largest_key, start) <= 0 ||
+ icmp.Compare(f.smallest_key, end) > 0) {
+ // Entire file is before or after the start/end keys range
+ return 0;
+ }
+
+ if (icmp.Compare(f.smallest_key, start) >= 0) {
+ // Start of the range is before the file start - approximate by end offset
+ return ApproximateOffsetOf(v, f, end, caller);
+ }
+
+ if (icmp.Compare(f.largest_key, end) < 0) {
+ // End of the range is after the file end - approximate by subtracting
+ // start offset from the file size
+ uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller);
+ assert(f.fd.GetFileSize() >= start_offset);
+ return f.fd.GetFileSize() - start_offset;
+ }
+
+ // The interval falls entirely in the range for this file.
+ TableCache* table_cache = v->cfd_->table_cache();
+ if (table_cache == nullptr) {
+ return 0;
+ }
+ return table_cache->ApproximateSize(
+ start, end, f.file_metadata->fd, caller, icmp,
+ v->GetMutableCFOptions().prefix_extractor.get());
+}
+
+void VersionSet::AddLiveFiles(std::vector<FileDescriptor>* live_list) {
+ // pre-calculate space requirement
+ int64_t total_files = 0;
+ for (auto cfd : *column_family_set_) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ Version* dummy_versions = cfd->dummy_versions();
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ const auto* vstorage = v->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); level++) {
+ total_files += vstorage->LevelFiles(level).size();
+ }
+ }
+ }
+
+ // just one time extension to the right size
+ live_list->reserve(live_list->size() + static_cast<size_t>(total_files));
+
+ for (auto cfd : *column_family_set_) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ auto* current = cfd->current();
+ bool found_current = false;
+ Version* dummy_versions = cfd->dummy_versions();
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ v->AddLiveFiles(live_list);
+ if (v == current) {
+ found_current = true;
+ }
+ }
+ if (!found_current && current != nullptr) {
+ // Should never happen unless it is a bug.
+ assert(false);
+ current->AddLiveFiles(live_list);
+ }
+ }
+}
+
+InternalIterator* VersionSet::MakeInputIterator(
+ const Compaction* c, RangeDelAggregator* range_del_agg,
+ const FileOptions& file_options_compactions) {
+ auto cfd = c->column_family_data();
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+ read_options.fill_cache = false;
+ // Compaction iterators shouldn't be confined to a single prefix.
+ // Compactions use Seek() for
+ // (a) concurrent compactions,
+ // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
+ read_options.total_order_seek = true;
+
+ // Level-0 files have to be merged together. For other levels,
+ // we will make a concatenating iterator per level.
+ // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+ const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files +
+ c->num_input_levels() - 1
+ : c->num_input_levels());
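+  // Illustrative sizing (hypothetical numbers): an L0 -> L1 compaction with
+  // four L0 files needs 4 table iterators for L0 plus one concatenating
+  // iterator for L1, i.e. space = 4 + 2 - 1 = 5.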
+  InternalIterator** list = new InternalIterator*[space];
+ size_t num = 0;
+ for (size_t which = 0; which < c->num_input_levels(); which++) {
+ if (c->input_levels(which)->num_files != 0) {
+ if (c->level(which) == 0) {
+ const LevelFilesBrief* flevel = c->input_levels(which);
+ for (size_t i = 0; i < flevel->num_files; i++) {
+ list[num++] = cfd->table_cache()->NewIterator(
+ read_options, file_options_compactions,
+ cfd->internal_comparator(),
+ *flevel->files[i].file_metadata, range_del_agg,
+ c->mutable_cf_options()->prefix_extractor.get(),
+ /*table_reader_ptr=*/nullptr,
+ /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction,
+ /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/static_cast<int>(which),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ }
+ } else {
+ // Create concatenating iterator for the files from this level
+ list[num++] = new LevelIterator(
+ cfd->table_cache(), read_options, file_options_compactions,
+ cfd->internal_comparator(), c->input_levels(which),
+ c->mutable_cf_options()->prefix_extractor.get(),
+ /*should_sample=*/false,
+ /*no per level latency histogram=*/nullptr,
+ TableReaderCaller::kCompaction, /*skip_filters=*/false,
+ /*level=*/static_cast<int>(which), range_del_agg,
+ c->boundaries(which));
+ }
+ }
+ }
+ assert(num <= space);
+ InternalIterator* result =
+ NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
+ static_cast<int>(num));
+ delete[] list;
+ return result;
+}
+
+// verify that the files listed in this compaction are present
+// in the current version
+bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
+#ifndef NDEBUG
+ Version* version = c->column_family_data()->current();
+ const VersionStorageInfo* vstorage = version->storage_info();
+ if (c->input_version() != version) {
+ ROCKS_LOG_INFO(
+ db_options_->info_log,
+ "[%s] compaction output being applied to a different base version from"
+ " input version",
+ c->column_family_data()->GetName().c_str());
+
+ if (vstorage->compaction_style_ == kCompactionStyleLevel &&
+ c->start_level() == 0 && c->num_input_levels() > 2U) {
+      // We are doing an L0->base_level compaction. The assumption is that if
+      // the base level is not L1, then levels from L1 to base_level - 1 are
+      // empty. This is ensured by allowing only one L0 compaction at a time
+      // in level-based compaction, so during that time no other
+      // compaction/flush can put files into those levels.
+ for (int l = c->start_level() + 1; l < c->output_level(); l++) {
+ if (vstorage->NumLevelFiles(l) != 0) {
+ return false;
+ }
+ }
+ }
+ }
+
+ for (size_t input = 0; input < c->num_input_levels(); ++input) {
+ int level = c->level(input);
+ for (size_t i = 0; i < c->num_input_files(input); ++i) {
+ uint64_t number = c->input(input, i)->fd.GetNumber();
+ bool found = false;
+ for (size_t j = 0; j < vstorage->files_[level].size(); j++) {
+ FileMetaData* f = vstorage->files_[level][j];
+ if (f->fd.GetNumber() == number) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+        return false;  // input files not present in the current version
+ }
+ }
+ }
+#else
+ (void)c;
+#endif
+ return true; // everything good
+}
+
+Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
+ FileMetaData** meta,
+ ColumnFamilyData** cfd) {
+ for (auto cfd_iter : *column_family_set_) {
+ if (!cfd_iter->initialized()) {
+ continue;
+ }
+ Version* version = cfd_iter->current();
+ const auto* vstorage = version->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); level++) {
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ if (file->fd.GetNumber() == number) {
+ *meta = file;
+ *filelevel = level;
+ *cfd = cfd_iter;
+ return Status::OK();
+ }
+ }
+ }
+ }
+ return Status::NotFound("File not present in any level");
+}
+
+void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ for (const auto& file :
+ cfd->current()->storage_info()->LevelFiles(level)) {
+ LiveFileMetaData filemetadata;
+ filemetadata.column_family_name = cfd->GetName();
+ uint32_t path_id = file->fd.GetPathId();
+ if (path_id < cfd->ioptions()->cf_paths.size()) {
+ filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path;
+ } else {
+ assert(!cfd->ioptions()->cf_paths.empty());
+ filemetadata.db_path = cfd->ioptions()->cf_paths.back().path;
+ }
+ const uint64_t file_number = file->fd.GetNumber();
+ filemetadata.name = MakeTableFileName("", file_number);
+ filemetadata.file_number = file_number;
+ filemetadata.level = level;
+ filemetadata.size = static_cast<size_t>(file->fd.GetFileSize());
+ filemetadata.smallestkey = file->smallest.user_key().ToString();
+ filemetadata.largestkey = file->largest.user_key().ToString();
+ filemetadata.smallest_seqno = file->fd.smallest_seqno;
+ filemetadata.largest_seqno = file->fd.largest_seqno;
+ filemetadata.num_reads_sampled = file->stats.num_reads_sampled.load(
+ std::memory_order_relaxed);
+ filemetadata.being_compacted = file->being_compacted;
+ filemetadata.num_entries = file->num_entries;
+ filemetadata.num_deletions = file->num_deletions;
+ filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
+ filemetadata.file_checksum = file->file_checksum;
+ filemetadata.file_checksum_func_name = file->file_checksum_func_name;
+ metadata->push_back(filemetadata);
+ }
+ }
+ }
+}
+
+void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+ std::vector<std::string>* manifest_filenames,
+ uint64_t min_pending_output) {
+ assert(manifest_filenames->empty());
+ obsolete_manifests_.swap(*manifest_filenames);
+ std::vector<ObsoleteFileInfo> pending_files;
+ for (auto& f : obsolete_files_) {
+ if (f.metadata->fd.GetNumber() < min_pending_output) {
+ files->push_back(std::move(f));
+ } else {
+ pending_files.push_back(std::move(f));
+ }
+ }
+ obsolete_files_.swap(pending_files);
+}
+
+ColumnFamilyData* VersionSet::CreateColumnFamily(
+ const ColumnFamilyOptions& cf_options, VersionEdit* edit) {
+ assert(edit->is_column_family_add_);
+
+ MutableCFOptions dummy_cf_options;
+ Version* dummy_versions =
+ new Version(nullptr, this, file_options_, dummy_cf_options);
+ // Ref() dummy version once so that later we can call Unref() to delete it
+ // by avoiding calling "delete" explicitly (~Version is private)
+ dummy_versions->Ref();
+ auto new_cfd = column_family_set_->CreateColumnFamily(
+ edit->column_family_name_, edit->column_family_, dummy_versions,
+ cf_options);
+
+ Version* v = new Version(new_cfd, this, file_options_,
+ *new_cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+
+ // Fill level target base information.
+ v->storage_info()->CalculateBaseBytes(*new_cfd->ioptions(),
+ *new_cfd->GetLatestMutableCFOptions());
+ AppendVersion(new_cfd, v);
+ // GetLatestMutableCFOptions() is safe here without mutex since the
+ // cfd is not available to client
+ new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(),
+ LastSequence());
+ new_cfd->SetLogNumber(edit->log_number_);
+ return new_cfd;
+}
+
+uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
+ uint64_t count = 0;
+ for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ count++;
+ }
+ return count;
+}
+
+uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
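+  // An SST file may be referenced by more than one live Version, so files are
+  // de-duplicated by their packed (file number, path id) before their sizes
+  // are summed.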
+ std::unordered_set<uint64_t> unique_files;
+ uint64_t total_files_size = 0;
+ for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ VersionStorageInfo* storage_info = v->storage_info();
+ for (int level = 0; level < storage_info->num_levels_; level++) {
+ for (const auto& file_meta : storage_info->LevelFiles(level)) {
+ if (unique_files.find(file_meta->fd.packed_number_and_path_id) ==
+ unique_files.end()) {
+ unique_files.insert(file_meta->fd.packed_number_and_path_id);
+ total_files_size += file_meta->fd.GetFileSize();
+ }
+ }
+ }
+ }
+ return total_files_size;
+}
+
+ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& _file_options,
+ Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller)
+ : VersionSet(dbname, _db_options, _file_options, table_cache,
+ write_buffer_manager, write_controller,
+ /*block_cache_tracer=*/nullptr),
+ number_of_edits_to_skip_(0) {}
+
+ReactiveVersionSet::~ReactiveVersionSet() {}
+
+Status ReactiveVersionSet::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+ std::unique_ptr<Status>* manifest_reader_status) {
+ assert(manifest_reader != nullptr);
+ assert(manifest_reporter != nullptr);
+ assert(manifest_reader_status != nullptr);
+
+ std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_options;
+ for (const auto& cf : column_families) {
+ cf_name_to_options.insert({cf.name, cf.options});
+ }
+
+ // add default column family
+ auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
+ if (default_cf_iter == cf_name_to_options.end()) {
+ return Status::InvalidArgument("Default column family not specified");
+ }
+ VersionEdit default_cf_edit;
+ default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+ default_cf_edit.SetColumnFamily(0);
+ ColumnFamilyData* default_cfd =
+ CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
+ // In recovery, nobody else can access it, so it's fine to set it to be
+ // initialized earlier.
+ default_cfd->set_initialized();
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+ builders;
+ std::unordered_map<int, std::string> column_families_not_found;
+ builders.insert(
+ std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
+ new BaseReferencedVersionBuilder(default_cfd))));
+
+ manifest_reader_status->reset(new Status());
+ manifest_reporter->reset(new LogReporter());
+ static_cast<LogReporter*>(manifest_reporter->get())->status =
+ manifest_reader_status->get();
+ Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
+ log::Reader* reader = manifest_reader->get();
+
+ int retry = 0;
+ VersionEdit version_edit;
+ while (s.ok() && retry < 1) {
+ assert(reader != nullptr);
+ Slice record;
+ std::string scratch;
+ s = ReadAndRecover(reader, &read_buffer_, cf_name_to_options,
+ column_families_not_found, builders, &version_edit);
+ if (s.ok()) {
+ bool enough = version_edit.has_next_file_number_ &&
+ version_edit.has_log_number_ &&
+ version_edit.has_last_sequence_;
+ if (enough) {
+ for (const auto& cf : column_families) {
+ auto cfd = column_family_set_->GetColumnFamily(cf.name);
+ if (cfd == nullptr) {
+ enough = false;
+ break;
+ }
+ }
+ }
+ if (enough) {
+ for (const auto& cf : column_families) {
+ auto cfd = column_family_set_->GetColumnFamily(cf.name);
+ assert(cfd != nullptr);
+ if (!cfd->IsDropped()) {
+ auto builder_iter = builders.find(cfd->GetID());
+ assert(builder_iter != builders.end());
+ auto builder = builder_iter->second->version_builder();
+ assert(builder != nullptr);
+ s = builder->LoadTableHandlers(
+ cfd->internal_stats(), db_options_->max_file_opening_threads,
+ false /* prefetch_index_and_filter_in_cache */,
+ true /* is_initial_load */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
+ if (!s.ok()) {
+ enough = false;
+ if (s.IsPathNotFound()) {
+ s = Status::OK();
+ }
+ break;
+ }
+ }
+ }
+ }
+ if (enough) {
+ break;
+ }
+ }
+ ++retry;
+ }
+
+ if (s.ok()) {
+ if (!version_edit.has_prev_log_number_) {
+ version_edit.prev_log_number_ = 0;
+ }
+ column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_);
+
+ MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_);
+ MarkFileNumberUsed(version_edit.prev_log_number_);
+ MarkFileNumberUsed(version_edit.log_number_);
+
+ for (auto cfd : *column_family_set_) {
+ assert(builders.count(cfd->GetID()) > 0);
+ auto builder = builders[cfd->GetID()]->version_builder();
+ if (!builder->CheckConsistencyForNumLevels()) {
+ s = Status::InvalidArgument(
+ "db has more levels than options.num_levels");
+ break;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ auto builders_iter = builders.find(cfd->GetID());
+ assert(builders_iter != builders.end());
+ auto* builder = builders_iter->second->version_builder();
+
+ Version* v = new Version(cfd, this, file_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+ builder->SaveTo(v->storage_info());
+
+ // Install recovered version
+ v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
+ !(db_options_->skip_stats_update_on_db_open));
+ AppendVersion(cfd, v);
+ }
+ next_file_number_.store(version_edit.next_file_number_ + 1);
+ last_allocated_sequence_ = version_edit.last_sequence_;
+ last_published_sequence_ = version_edit.last_sequence_;
+ last_sequence_ = version_edit.last_sequence_;
+ prev_log_number_ = version_edit.prev_log_number_;
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Column family [%s] (ID %u), log number is %" PRIu64 "\n",
+ cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+ }
+ }
+ return s;
+}
+
+Status ReactiveVersionSet::ReadAndApply(
+ InstrumentedMutex* mu,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed) {
+ assert(manifest_reader != nullptr);
+ assert(cfds_changed != nullptr);
+ mu->AssertHeld();
+
+ Status s;
+ uint64_t applied_edits = 0;
+ while (s.ok()) {
+ Slice record;
+ std::string scratch;
+ log::Reader* reader = manifest_reader->get();
+ std::string old_manifest_path = reader->file()->file_name();
+ while (reader->ReadRecord(&record, &scratch)) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+
+      // Skip the first VersionEdits of each MANIFEST generated by
+      // VersionSet::WriteCurrentStateToManifest.
+ if (number_of_edits_to_skip_ > 0) {
+ ColumnFamilyData* cfd =
+ column_family_set_->GetColumnFamily(edit.column_family_);
+ if (cfd != nullptr && !cfd->IsDropped()) {
+ --number_of_edits_to_skip_;
+ }
+ continue;
+ }
+
+ s = read_buffer_.AddEdit(&edit);
+ if (!s.ok()) {
+ break;
+ }
+ VersionEdit temp_edit;
+ if (edit.is_in_atomic_group_) {
+ if (read_buffer_.IsFull()) {
+ // Apply edits in an atomic group when we have read all edits in the
+ // group.
+ for (auto& e : read_buffer_.replay_buffer()) {
+ s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit);
+ if (!s.ok()) {
+ break;
+ }
+ applied_edits++;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ read_buffer_.Clear();
+ }
+ } else {
+ // Apply a normal edit immediately.
+ s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit);
+ if (s.ok()) {
+ applied_edits++;
+ }
+ }
+ }
+ if (!s.ok()) {
+ // Clear the buffer if we fail to decode/apply an edit.
+ read_buffer_.Clear();
+ }
+ // It's possible that:
+ // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted.
+ // 2) we have finished reading the current MANIFEST.
+ // 3) we have encountered an IOError reading the current MANIFEST.
+ // We need to look for the next MANIFEST and start from there. If we cannot
+ // find the next MANIFEST, we should exit the loop.
+ s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
+ reader = manifest_reader->get();
+ if (s.ok()) {
+ if (reader->file()->file_name() == old_manifest_path) {
+ // Still processing the same MANIFEST, thus no need to continue this
+ // loop since no record is available if we have reached here.
+ break;
+ } else {
+ // We have switched to a new MANIFEST whose first records have been
+        // generated by VersionSet::WriteCurrentStateToManifest. Since the
+ // secondary instance has already finished recovering upon start, there
+ // is no need for the secondary to process these records. Actually, if
+ // the secondary were to replay these records, the secondary may end up
+ // adding the same SST files AGAIN to each column family, causing
+ // consistency checks done by VersionBuilder to fail. Therefore, we
+ // record the number of records to skip at the beginning of the new
+ // MANIFEST and ignore them.
+ number_of_edits_to_skip_ = 0;
+ for (auto* cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ // Increase number_of_edits_to_skip by 2 because
+          // WriteCurrentStateToManifest() writes 2 version edits for each
+ // column family at the beginning of the newly-generated MANIFEST.
+ // TODO(yanqin) remove hard-coded value.
+ if (db_options_->write_dbid_to_manifest) {
+ number_of_edits_to_skip_ += 3;
+ } else {
+ number_of_edits_to_skip_ += 2;
+ }
+ }
+ }
+ }
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *column_family_set_) {
+ auto builder_iter = active_version_builders_.find(cfd->GetID());
+ if (builder_iter == active_version_builders_.end()) {
+ continue;
+ }
+ auto builder = builder_iter->second->version_builder();
+ if (!builder->CheckConsistencyForNumLevels()) {
+ s = Status::InvalidArgument(
+ "db has more levels than options.num_levels");
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits",
+ &applied_edits);
+ return s;
+}
+
+Status ReactiveVersionSet::ApplyOneVersionEditToBuilder(
+ VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ VersionEdit* version_edit) {
+ ColumnFamilyData* cfd =
+ column_family_set_->GetColumnFamily(edit.column_family_);
+
+ // If we cannot find this column family in our column family set, then it
+ // may be a new column family created by the primary after the secondary
+ // starts. It is also possible that the secondary instance opens only a subset
+ // of column families. Ignore it for now.
+ if (nullptr == cfd) {
+ return Status::OK();
+ }
+ if (active_version_builders_.find(edit.column_family_) ==
+ active_version_builders_.end() &&
+ !cfd->IsDropped()) {
+ std::unique_ptr<BaseReferencedVersionBuilder> builder_guard(
+ new BaseReferencedVersionBuilder(cfd));
+ active_version_builders_.insert(
+ std::make_pair(edit.column_family_, std::move(builder_guard)));
+ }
+
+ auto builder_iter = active_version_builders_.find(edit.column_family_);
+ assert(builder_iter != active_version_builders_.end());
+ auto builder = builder_iter->second->version_builder();
+ assert(builder != nullptr);
+
+ if (edit.is_column_family_add_) {
+ // TODO (yanqin) for now the secondary ignores column families created
+ // after Open. This also simplifies handling of switching to a new MANIFEST
+ // and processing the snapshot of the system at the beginning of the
+ // MANIFEST.
+ } else if (edit.is_column_family_drop_) {
+ // Drop the column family by setting it to be 'dropped' without destroying
+ // the column family handle.
+      // TODO (haoyu) figure out how to handle column family drop for
+ // secondary instance. (Is it possible that the ref count for cfd is 0 but
+ // the ref count for its versions is higher than 0?)
+ cfd->SetDropped();
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ active_version_builders_.erase(builder_iter);
+ } else {
+ Status s = builder->Apply(&edit);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (cfd != nullptr && !cfd->IsDropped()) {
+ s = builder->LoadTableHandlers(
+ cfd->internal_stats(), db_options_->max_file_opening_threads,
+ false /* prefetch_index_and_filter_in_cache */,
+ false /* is_initial_load */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
+ TEST_SYNC_POINT_CALLBACK(
+ "ReactiveVersionSet::ApplyOneVersionEditToBuilder:"
+ "AfterLoadTableHandlers",
+ &s);
+
+ if (s.ok()) {
+ auto version = new Version(cfd, this, file_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ current_version_number_++);
+ builder->SaveTo(version->storage_info());
+ version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true);
+ AppendVersion(cfd, version);
+ active_version_builders_.erase(builder_iter);
+ if (cfds_changed->count(cfd) == 0) {
+ cfds_changed->insert(cfd);
+ }
+ } else if (s.IsPathNotFound()) {
+ s = Status::OK();
+ }
+    // Any other error from LoadTableHandlers stays in s and is returned at
+    // the end of this function.
+ }
+
+ if (version_edit->HasNextFile()) {
+ next_file_number_.store(version_edit->next_file_number_ + 1);
+ }
+ if (version_edit->has_last_sequence_) {
+ last_allocated_sequence_ = version_edit->last_sequence_;
+ last_published_sequence_ = version_edit->last_sequence_;
+ last_sequence_ = version_edit->last_sequence_;
+ }
+ if (version_edit->has_prev_log_number_) {
+ prev_log_number_ = version_edit->prev_log_number_;
+ MarkFileNumberUsed(version_edit->prev_log_number_);
+ }
+ if (version_edit->has_log_number_) {
+ MarkFileNumberUsed(version_edit->log_number_);
+ }
+ column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_);
+ MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_);
+ return s;
+}
+
+Status ReactiveVersionSet::MaybeSwitchManifest(
+ log::Reader::Reporter* reporter,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
+ assert(manifest_reader != nullptr);
+ Status s;
+ do {
+ std::string manifest_path;
+ s = GetCurrentManifestPath(dbname_, fs_, &manifest_path,
+ &manifest_file_number_);
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ if (s.ok()) {
+ if (nullptr == manifest_reader->get() ||
+ manifest_reader->get()->file()->file_name() != manifest_path) {
+ TEST_SYNC_POINT(
+ "ReactiveVersionSet::MaybeSwitchManifest:"
+ "AfterGetCurrentManifestPath:0");
+ TEST_SYNC_POINT(
+ "ReactiveVersionSet::MaybeSwitchManifest:"
+ "AfterGetCurrentManifestPath:1");
+ s = fs_->NewSequentialFile(manifest_path,
+ env_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ } else {
+ // No need to switch manifest.
+ break;
+ }
+ }
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ if (s.ok()) {
+ manifest_file_reader.reset(
+ new SequentialFileReader(std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size));
+ manifest_reader->reset(new log::FragmentBufferedReader(
+ nullptr, std::move(manifest_file_reader), reporter,
+ true /* checksum */, 0 /* log_number */));
+ ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
+ manifest_path.c_str());
+ // TODO (yanqin) every time we switch to a new MANIFEST, we clear the
+ // active_version_builders_ map because we choose to construct the
+ // versions from scratch, thanks to the first part of each MANIFEST
+      // written by VersionSet::WriteCurrentStateToManifest. This is not
+ // necessary, but we choose this at present for the sake of simplicity.
+ active_version_builders_.clear();
+ }
+ } while (s.IsPathNotFound());
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h
new file mode 100644
index 000000000..2ab09a5f8
--- /dev/null
+++ b/src/rocksdb/db/version_set.h
@@ -0,0 +1,1251 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level. The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/dbformat.h"
+#include "db/file_indexer.h"
+#include "db/log_reader.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace log {
+class Writer;
+}
+
+class Compaction;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class WriteBufferManager;
+class MergeContext;
+class ColumnFamilySet;
+class MergeIteratorBuilder;
+
+// A VersionEdit is always supposed to be valid and is used to point at
+// entries in the MANIFEST. Ideally it should not be used as a container to
+// carry around a few of its fields as function params, because that can lead
+// readers to think it is a valid entry from the MANIFEST. To avoid that
+// confusion, VersionEditParams is introduced to simply carry around multiple
+// VersionEdit params. It need not point to a valid record in the MANIFEST.
+using VersionEditParams = VersionEdit;
+
+// Return the smallest index i such that file_level.files[i]->largest >= key.
+// Return file_level.num_files if there is no such file.
+// REQUIRES: "file_level.files" contains a sorted list of
+// non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level, const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key larger than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, file_level.files[]
+// contains disjoint ranges in sorted order.
+extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const LevelFilesBrief& file_level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+// Generate LevelFilesBrief from a vector of FileMetaData*.
+// Copies smallest_key and largest_key data to sequentially allocated memory.
+// arena: Arena used to allocate the memory
+extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+ const std::vector<FileMetaData*>& files,
+ Arena* arena);
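+
+// Minimal illustrative sketch (a hypothetical helper, for exposition only and
+// not referenced elsewhere) of how the two declarations above are typically
+// combined: first flatten a level's FileMetaData list into a LevelFilesBrief,
+// then binary search it. The return value equals brief.num_files when every
+// file's largest key is smaller than the internal key being probed.
+inline size_t ExampleLocateFileInLevel(
+    const InternalKeyComparator& icmp,
+    const std::vector<FileMetaData*>& level_files, const Slice& internal_key,
+    Arena* arena) {
+  LevelFilesBrief brief;
+  DoGenerateLevelFilesBrief(&brief, level_files, arena);
+  return static_cast<size_t>(FindFile(icmp, brief, internal_key));
+}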
+
+// Information of the storage associated with each Version, including number of
+// levels of LSM tree, files information at each level, files marked for
+// compaction, etc.
+class VersionStorageInfo {
+ public:
+ VersionStorageInfo(const InternalKeyComparator* internal_comparator,
+ const Comparator* user_comparator, int num_levels,
+ CompactionStyle compaction_style,
+ VersionStorageInfo* src_vstorage,
+ bool _force_consistency_checks);
+ // No copying allowed
+ VersionStorageInfo(const VersionStorageInfo&) = delete;
+ void operator=(const VersionStorageInfo&) = delete;
+ ~VersionStorageInfo();
+
+ void Reserve(int level, size_t size) { files_[level].reserve(size); }
+
+ void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr);
+
+ void SetFinalized();
+
+ // Update num_non_empty_levels_.
+ void UpdateNumNonEmptyLevels();
+
+ void GenerateFileIndexer() {
+ file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
+ }
+
+ // Update the accumulated stats from a file-meta.
+ void UpdateAccumulatedStats(FileMetaData* file_meta);
+
+ // Decrease the current stat from a to-be-deleted file-meta
+ void RemoveCurrentStats(FileMetaData* file_meta);
+
+ void ComputeCompensatedSizes();
+
+ // Updates internal structures that keep track of compaction scores
+ // We use compaction scores to figure out which compaction to do next
+ // REQUIRES: db_mutex held!!
+ // TODO find a better way to pass compaction_options_fifo.
+ void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options,
+ const MutableCFOptions& mutable_cf_options);
+
+  // Estimate estimated_compaction_needed_bytes_
+ void EstimateCompactionBytesNeeded(
+ const MutableCFOptions& mutable_cf_options);
+
+ // This computes files_marked_for_compaction_ and is called by
+ // ComputeCompactionScore()
+ void ComputeFilesMarkedForCompaction();
+
+  // This computes expired_ttl_files_ and is called by
+ // ComputeCompactionScore()
+ void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions,
+ const uint64_t ttl);
+
+ // This computes files_marked_for_periodic_compaction_ and is called by
+ // ComputeCompactionScore()
+ void ComputeFilesMarkedForPeriodicCompaction(
+ const ImmutableCFOptions& ioptions,
+ const uint64_t periodic_compaction_seconds);
+
+ // This computes bottommost_files_marked_for_compaction_ and is called by
+ // ComputeCompactionScore() or UpdateOldestSnapshot().
+ //
+ // Among bottommost files (assumes they've already been computed), marks the
+ // ones that have keys that would be eliminated if recompacted, according to
+ // the seqnum of the oldest existing snapshot. Must be called every time
+ // oldest snapshot changes as that is when bottom-level files can become
+ // eligible for compaction.
+ //
+ // REQUIRES: DB mutex held
+ void ComputeBottommostFilesMarkedForCompaction();
+
+ // Generate level_files_brief_ from files_
+ void GenerateLevelFilesBrief();
+ // Sort all files for this version based on their file size and
+ // record results in files_by_compaction_pri_. The largest files are listed
+ // first.
+ void UpdateFilesByCompactionPri(CompactionPri compaction_pri);
+
+ void GenerateLevel0NonOverlapping();
+ bool level0_non_overlapping() const {
+ return level0_non_overlapping_;
+ }
+
+ // Check whether each file in this version is bottommost (i.e., nothing in its
+ // key-range could possibly exist in an older file/level).
+ // REQUIRES: This version has not been saved
+ void GenerateBottommostFiles();
+
+ // Updates the oldest snapshot and related internal state, like the bottommost
+ // files marked for compaction.
+ // REQUIRES: DB mutex held
+ void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum);
+
+ int MaxInputLevel() const;
+ int MaxOutputLevel(bool allow_ingest_behind) const;
+
+ // Return level number that has idx'th highest score
+ int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
+
+ // Return idx'th highest score
+ double CompactionScore(int idx) const { return compaction_score_[idx]; }
+
+ void GetOverlappingInputs(
+ int level, const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index = -1, // index of overlap file
+ int* file_index = nullptr, // return index of overlap file
+ bool expand_range = true, // if set, returns files which overlap the
+ // range and overlap each other. If false,
+ // then just files intersecting the range
+ InternalKey** next_smallest = nullptr) // if non-null, returns the
+ const; // smallest key of next file not included
+ void GetCleanInputsWithinInterval(
+ int level, const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index = -1, // index of overlap file
+ int* file_index = nullptr) // return index of overlap file
+ const;
+
+ void GetOverlappingInputsRangeBinarySearch(
+ int level, // level > 0
+ const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index, // index of overlap file
+ int* file_index, // return index of overlap file
+ bool within_interval = false, // if set, force the inputs within interval
+ InternalKey** next_smallest = nullptr) // if non-null, returns the
+ const; // smallest key of next file not included
+
+ // Returns true iff some file in the specified level overlaps
+ // some part of [*smallest_user_key,*largest_user_key].
+ // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key larger than all keys in the DB.
+ bool OverlapInLevel(int level, const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+ // Returns true iff the first or last file in inputs contains
+ // an overlapping user key to the file "just outside" of it (i.e.
+ // just after the last file, or just before the first file)
+ // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+ bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+ int level);
+
+ int num_levels() const { return num_levels_; }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ int num_non_empty_levels() const {
+ assert(finalized_);
+ return num_non_empty_levels_;
+ }
+
+ // REQUIRES: This version has been finalized.
+ // (CalculateBaseBytes() is called)
+  // This may or may not return the number of level files. It is kept this way
+  // to preserve backward-compatible behavior in universal compaction.
+ int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
+
+ void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ int NumLevelFiles(int level) const {
+ assert(finalized_);
+ return static_cast<int>(files_[level].size());
+ }
+
+ // Return the combined file size of all files at the specified level.
+ uint64_t NumLevelBytes(int level) const;
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ const std::vector<FileMetaData*>& LevelFiles(int level) const {
+ return files_[level];
+ }
+
+ const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const {
+ assert(level < static_cast<int>(level_files_brief_.size()));
+ return level_files_brief_[level];
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ const std::vector<int>& FilesByCompactionPri(int level) const {
+ assert(finalized_);
+ return files_by_compaction_pri_[level];
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
+ const {
+ assert(finalized_);
+ return files_marked_for_compaction_;
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
+ assert(finalized_);
+ return expired_ttl_files_;
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>&
+ FilesMarkedForPeriodicCompaction() const {
+ assert(finalized_);
+ return files_marked_for_periodic_compaction_;
+ }
+
+ void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
+ files_marked_for_periodic_compaction_.emplace_back(level, f);
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>&
+ BottommostFilesMarkedForCompaction() const {
+ assert(finalized_);
+ return bottommost_files_marked_for_compaction_;
+ }
+
+ int base_level() const { return base_level_; }
+ double level_multiplier() const { return level_multiplier_; }
+
+ // REQUIRES: lock is held
+ // Set the index that is used to offset into files_by_compaction_pri_ to find
+ // the next compaction candidate file.
+ void SetNextCompactionIndex(int level, int index) {
+ next_file_to_compact_by_size_[level] = index;
+ }
+
+ // REQUIRES: lock is held
+ int NextCompactionIndex(int level) const {
+ return next_file_to_compact_by_size_[level];
+ }
+
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ const FileIndexer& file_indexer() const {
+ assert(finalized_);
+ return file_indexer_;
+ }
+
+ // Only the first few entries of files_by_compaction_pri_ are sorted.
+ // There is no need to sort all the files because it is likely
+ // that on a running system, we need to look at only the first
+ // few largest files because a new version is created every few
+ // seconds/minutes (because of concurrent compactions).
+ static const size_t kNumberFilesToSort = 50;
+
+ // Return a human-readable short (single-line) summary of the number
+ // of files per level. Uses *scratch as backing store.
+ struct LevelSummaryStorage {
+ char buffer[1000];
+ };
+ struct FileSummaryStorage {
+ char buffer[3000];
+ };
+ const char* LevelSummary(LevelSummaryStorage* scratch) const;
+ // Return a human-readable short (single-line) summary of files
+ // in a specified level. Uses *scratch as backing store.
+ const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
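+  // Minimal usage sketch of the scratch-buffer pattern above (hypothetical
+  // caller, for exposition only):
+  //
+  //   VersionStorageInfo::LevelSummaryStorage tmp;
+  //   ROCKS_LOG_INFO(info_log, "files per level: %s",
+  //                  vstorage->LevelSummary(&tmp));
+  //
+  // The returned pointer refers into *scratch, so the storage object must
+  // outlive every use of the formatted string.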
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t MaxNextLevelOverlappingBytes();
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString(bool hex = false) const;
+
+ uint64_t GetAverageValueSize() const {
+ if (accumulated_num_non_deletions_ == 0) {
+ return 0;
+ }
+ assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
+ assert(accumulated_file_size_ > 0);
+ return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
+ accumulated_file_size_ /
+ (accumulated_raw_key_size_ + accumulated_raw_value_size_);
+ }
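+  // Worked example for the estimate above (hypothetical numbers): with
+  // accumulated_raw_value_size_ = 800, accumulated_num_non_deletions_ = 100,
+  // accumulated_file_size_ = 500 and accumulated_raw_key_size_ = 200, the
+  // average raw value is 800 / 100 = 8 bytes, which is then scaled by the
+  // on-disk ratio 500 / (200 + 800), giving an estimated value size of 4.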
+
+ uint64_t GetEstimatedActiveKeys() const;
+
+ double GetEstimatedCompressionRatioAtLevel(int level) const;
+
+  // Re-initializes the index that is used to offset into
+  // files_by_compaction_pri_ to find the next compaction candidate file.
+ void ResetNextCompactionIndex(int level) {
+ next_file_to_compact_by_size_[level] = 0;
+ }
+
+ const InternalKeyComparator* InternalComparator() {
+ return internal_comparator_;
+ }
+
+ // Returns maximum total bytes of data on a given level.
+ uint64_t MaxBytesForLevel(int level) const;
+
+ // Must be called after any change to MutableCFOptions.
+ void CalculateBaseBytes(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& options);
+
+ // Returns an estimate of the amount of live data in bytes.
+ uint64_t EstimateLiveDataSize() const;
+
+ uint64_t estimated_compaction_needed_bytes() const {
+ return estimated_compaction_needed_bytes_;
+ }
+
+ void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
+ estimated_compaction_needed_bytes_ = v;
+ }
+
+ bool force_consistency_checks() const { return force_consistency_checks_; }
+
+ SequenceNumber bottommost_files_mark_threshold() const {
+ return bottommost_files_mark_threshold_;
+ }
+
+ // Returns whether any key in [`smallest_key`, `largest_key`] could appear in
+ // an older L0 file than `last_l0_idx` or in a greater level than `last_level`
+ //
+ // @param last_level Level after which we check for overlap
+ // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
+ // check for overlap; otherwise, must be -1
+ bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int last_level, int last_l0_idx);
+
+ private:
+ const InternalKeyComparator* internal_comparator_;
+ const Comparator* user_comparator_;
+ int num_levels_; // Number of levels
+  int num_non_empty_levels_;  // Number of non-empty levels. Any level larger
+                              // than it is guaranteed to be empty.
+ // Per-level max bytes
+ std::vector<uint64_t> level_max_bytes_;
+
+ // A short brief metadata of files per level
+ autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
+ FileIndexer file_indexer_;
+ Arena arena_; // Used to allocate space for file_levels_
+
+ CompactionStyle compaction_style_;
+
+ // List of files per level, files in each level are arranged
+ // in increasing order of keys
+ std::vector<FileMetaData*>* files_;
+
+ // Level that L0 data should be compacted to. All levels < base_level_ should
+ // be empty. -1 if it is not level-compaction so it's not applicable.
+ int base_level_;
+
+ double level_multiplier_;
+
+ // A list for the same set of files that are stored in files_,
+ // but files in each level are now sorted based on file
+ // size. The file with the largest size is at the front.
+ // This vector stores the index of the file from files_.
+ std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, files in L0 have keys with non-overlapping ranges
+ bool level0_non_overlapping_;
+
+ // An index into files_by_compaction_pri_ that specifies the first
+ // file that is not yet compacted
+ std::vector<int> next_file_to_compact_by_size_;
+
+ // Only the first few entries of files_by_compaction_pri_ are sorted.
+ // There is no need to sort all the files because it is likely
+ // that on a running system, we need to look at only the first
+ // few largest files because a new version is created every few
+ // seconds/minutes (because of concurrent compactions).
+ static const size_t number_of_files_to_sort_ = 50;
+
+ // This vector contains list of files marked for compaction and also not
+ // currently being compacted. It is protected by DB mutex. It is calculated in
+ // ComputeCompactionScore()
+ autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
+
+ autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
+
+ autovector<std::pair<int, FileMetaData*>>
+ files_marked_for_periodic_compaction_;
+
+ // These files are considered bottommost because none of their keys can exist
+ // at lower levels. They are not necessarily all in the same level. The marked
+ // ones are eligible for compaction because they contain duplicate key
+ // versions that are no longer protected by snapshot. These variables are
+ // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and
+ // `ComputeBottommostFilesMarkedForCompaction()`.
+ autovector<std::pair<int, FileMetaData*>> bottommost_files_;
+ autovector<std::pair<int, FileMetaData*>>
+ bottommost_files_marked_for_compaction_;
+
+ // Threshold for needing to mark another bottommost file. Maintain it so we
+ // can quickly check when releasing a snapshot whether more bottommost files
+ // became eligible for compaction. It's defined as the min of the max nonzero
+ // seqnums of unmarked bottommost files.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ // Monotonically increases as we release old snapshots. Zero indicates no
+ // snapshots have been released yet. When no snapshots remain we set it to the
+ // current seqnum, which needs to be protected as a snapshot can still be
+ // created that references it.
+ SequenceNumber oldest_snapshot_seqnum_ = 0;
+
+ // Level that should be compacted next and its compaction score.
+ // Score < 1 means compaction is not strictly needed. These fields
+ // are initialized by Finalize().
+  // The most critical level to be compacted is listed first.
+  // These are used to pick the best compaction level.
+ std::vector<double> compaction_score_;
+ std::vector<int> compaction_level_;
+ int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop
+ // for number of L0 files.
+
+ // the following are the sampled temporary stats.
+ // the current accumulated size of sampled files.
+ uint64_t accumulated_file_size_;
+ // the current accumulated size of all raw keys based on the sampled files.
+ uint64_t accumulated_raw_key_size_;
+  // the current accumulated size of all raw values based on the sampled files.
+ uint64_t accumulated_raw_value_size_;
+ // total number of non-deletion entries
+ uint64_t accumulated_num_non_deletions_;
+ // total number of deletion entries
+ uint64_t accumulated_num_deletions_;
+ // current number of non_deletion entries
+ uint64_t current_num_non_deletions_;
+ // current number of deletion entries
+ uint64_t current_num_deletions_;
+ // current number of file samples
+ uint64_t current_num_samples_;
+ // Estimated bytes needed to be compacted until all levels' size is down to
+ // target sizes.
+ uint64_t estimated_compaction_needed_bytes_;
+
+ bool finalized_;
+
+ // If set to true, we will run consistency checks even if RocksDB
+ // is compiled in release mode
+ bool force_consistency_checks_;
+
+ friend class Version;
+ friend class VersionSet;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// A column family's version consists of the SST files owned by the column
+// family at a certain point in time.
+class Version {
+ public:
+ // Append to *iters a sequence of iterators that will
+ // yield the contents of this Version when merged together.
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ void AddIterators(const ReadOptions&, const FileOptions& soptions,
+ MergeIteratorBuilder* merger_iter_builder,
+ RangeDelAggregator* range_del_agg);
+
+ void AddIteratorsForLevel(const ReadOptions&, const FileOptions& soptions,
+ MergeIteratorBuilder* merger_iter_builder,
+ int level, RangeDelAggregator* range_del_agg);
+
+ Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level, bool* overlap);
+
+ // Lookup the value for key or get all merge operands for key.
+ // If do_merge = true (default) then lookup value for key.
+ // Behavior if do_merge = true:
+ // If found, store it in *value and
+ // return OK. Else return a non-OK status.
+ // Uses *operands to store merge_operator operations to apply later.
+ //
+ // If the ReadOptions.read_tier is set to do a read-only fetch, then
+ // *value_found will be set to false if it cannot be determined whether
+ // this value exists without doing IO.
+ //
+ // If the key is Deleted, *status will be set to NotFound and
+ // *key_exists will be set to true.
+ // If no key was found, *status will be set to NotFound and
+ // *key_exists will be set to false.
+ // If seq is non-null, *seq will be set to the sequence number found
+ // for the key if a key was found.
+ // Behavior if do_merge = false
+ // If the key has any merge operands then store them in
+ // merge_context.operands_list and don't merge the operands
+ // REQUIRES: lock is not held
+ void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
+ Status* status, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ bool* value_found = nullptr, bool* key_exists = nullptr,
+ SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
+ bool* is_blob = nullptr, bool do_merge = true);
+
+ void MultiGet(const ReadOptions&, MultiGetRange* range,
+ ReadCallback* callback = nullptr, bool* is_blob = nullptr);
+
+ // Loads some stats information from files. Call without mutex held. It needs
+ // to be called before applying the version to the version set.
+ void PrepareApply(const MutableCFOptions& mutable_cf_options,
+ bool update_stats);
+
+ // Reference count management (so Versions do not disappear out from
+ // under live iterators)
+ void Ref();
+ // Decrease reference count. Delete the object if no reference left
+ // and return true. Otherwise, return false.
+ bool Unref();
+
+ // Add all files listed in the current version to *live.
+ void AddLiveFiles(std::vector<FileDescriptor>* live);
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString(bool hex = false, bool print_stats = false) const;
+
+ // Returns the version number of this version
+ uint64_t GetVersionNumber() const { return version_number_; }
+
+ // REQUIRES: lock is held
+ // On success, "tp" will contains the table properties of the file
+ // specified in "file_meta". If the file name of "file_meta" is
+ // known ahead, passing it by a non-null "fname" can save a
+ // file-name conversion.
+ Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+ const FileMetaData* file_meta,
+ const std::string* fname = nullptr) const;
+
+ // REQUIRES: lock is held
+ // On success, *props will be populated with all SSTables' table properties.
+ // The keys of `props` are the sst file name, the values of `props` are the
+ // tables' properties, represented as std::shared_ptr.
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
+ Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
+ TablePropertiesCollection* props) const;
+
+ // Print summary of range delete tombstones in SST files into out_str,
+ // with maximum max_entries_to_print entries printed out.
+ Status TablesRangeTombstoneSummary(int max_entries_to_print,
+ std::string* out_str);
+
+ // REQUIRES: lock is held
+ // On success, "tp" will contains the aggregated table property among
+ // the table properties of all sst files in this version.
+ Status GetAggregatedTableProperties(
+ std::shared_ptr<const TableProperties>* tp, int level = -1);
+
+ uint64_t GetEstimatedActiveKeys() {
+ return storage_info_.GetEstimatedActiveKeys();
+ }
+
+ size_t GetMemoryUsageByTableReaders();
+
+ ColumnFamilyData* cfd() const { return cfd_; }
+
+ // Return the next Version in the linked list. Used for debug only
+ Version* TEST_Next() const {
+ return next_;
+ }
+
+ int TEST_refs() const { return refs_; }
+
+ VersionStorageInfo* storage_info() { return &storage_info_; }
+
+ VersionSet* version_set() { return vset_; }
+
+ void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+ uint64_t GetSstFilesSize();
+
+ // Retrieves the file_creation_time of the oldest file in the DB.
+ // Prerequisite for this API is max_open_files = -1
+ void GetCreationTimeOfOldestFile(uint64_t* creation_time);
+
+ const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
+
+ private:
+ Env* env_;
+ FileSystem* fs_;
+ friend class ReactiveVersionSet;
+ friend class VersionSet;
+
+ const InternalKeyComparator* internal_comparator() const {
+ return storage_info_.internal_comparator_;
+ }
+ const Comparator* user_comparator() const {
+ return storage_info_.user_comparator_;
+ }
+
+ bool PrefixMayMatch(const ReadOptions& read_options,
+ InternalIterator* level_iter,
+ const Slice& internal_prefix) const;
+
+ // Returns true if the filter blocks in the specified level will not be
+ // checked during read operations. In certain cases (trivial move or preload),
+  // the filter block may already be cached, but we still do not access it, so
+  // that it eventually expires from the cache.
+ bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
+
+ // The helper function of UpdateAccumulatedStats, which may fill the missing
+ // fields of file_meta from its associated TableProperties.
+ // Returns true if it does initialize FileMetaData.
+ bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
+
+ // Update the accumulated stats associated with the current version.
+ // This accumulated stats will be used in compaction.
+ void UpdateAccumulatedStats(bool update_stats);
+
+ // Sort all files for this version based on their file size and
+ // record results in files_by_compaction_pri_. The largest files are listed
+ // first.
+ void UpdateFilesByCompactionPri();
+
+ ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
+ Logger* info_log_;
+ Statistics* db_statistics_;
+ TableCache* table_cache_;
+ const MergeOperator* merge_operator_;
+
+ VersionStorageInfo storage_info_;
+ VersionSet* vset_; // VersionSet to which this Version belongs
+ Version* next_; // Next version in linked list
+ Version* prev_; // Previous version in linked list
+ int refs_; // Number of live refs to this version
+ const FileOptions file_options_;
+ const MutableCFOptions mutable_cf_options_;
+
+ // A version number that uniquely represents this version. This is
+ // used for debugging and logging purposes only.
+ uint64_t version_number_;
+
+ Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
+ MutableCFOptions mutable_cf_options, uint64_t version_number = 0);
+
+ ~Version();
+
+ // No copying allowed
+ Version(const Version&) = delete;
+ void operator=(const Version&) = delete;
+};
+
+struct ObsoleteFileInfo {
+ FileMetaData* metadata;
+ std::string path;
+
+ ObsoleteFileInfo() noexcept : metadata(nullptr) {}
+ ObsoleteFileInfo(FileMetaData* f, const std::string& file_path)
+ : metadata(f), path(file_path) {}
+
+ ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
+ ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
+
+ ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept :
+ ObsoleteFileInfo() {
+ *this = std::move(rhs);
+ }
+
+ ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
+ path = std::move(rhs.path);
+ metadata = rhs.metadata;
+ rhs.metadata = nullptr;
+
+ return *this;
+ }
+
+ void DeleteMetadata() {
+ delete metadata;
+ metadata = nullptr;
+ }
+};
+
+class BaseReferencedVersionBuilder;
+
+class AtomicGroupReadBuffer {
+ public:
+ Status AddEdit(VersionEdit* edit);
+ void Clear();
+ bool IsFull() const;
+ bool IsEmpty() const;
+
+ uint64_t TEST_read_edits_in_atomic_group() const {
+ return read_edits_in_atomic_group_;
+ }
+ std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
+
+ private:
+ uint64_t read_edits_in_atomic_group_ = 0;
+ std::vector<VersionEdit> replay_buffer_;
+};
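+// Minimal sketch of how a reader of the MANIFEST might drive the buffer above
+// (hypothetical pseudo-loop, error handling omitted):
+//
+//   AtomicGroupReadBuffer buffer;
+//   for every decoded VersionEdit edit:
+//     if edit belongs to an atomic group:
+//       buffer.AddEdit(&edit);
+//       if (buffer.IsFull()) { apply buffer.replay_buffer(); buffer.Clear(); }
+//     else:
+//       apply edit directly;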
+
+// VersionSet is the collection of versions of all the column families of the
+// database. Each database owns one VersionSet. A VersionSet has access to all
+// column families via ColumnFamilySet, i.e. set of the column families.
+class VersionSet {
+ public:
+ VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+ const FileOptions& file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer);
+ // No copying allowed
+ VersionSet(const VersionSet&) = delete;
+ void operator=(const VersionSet&) = delete;
+
+ virtual ~VersionSet();
+
+ // Apply *edit to the current version to form a new descriptor that
+ // is both saved to persistent state and installed as the new
+ // current version. Will release *mu while actually writing to the file.
+  // column_family_options has to be set if edit is a column family add.
+ // REQUIRES: *mu is held on entry.
+ // REQUIRES: no other thread concurrently calls LogAndApply()
+ Status LogAndApply(
+ ColumnFamilyData* column_family_data,
+ const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
+ InstrumentedMutex* mu, Directory* db_directory = nullptr,
+ bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr) {
+ autovector<ColumnFamilyData*> cfds;
+ cfds.emplace_back(column_family_data);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ mutable_cf_options_list.emplace_back(&mutable_cf_options);
+ autovector<autovector<VersionEdit*>> edit_lists;
+ autovector<VersionEdit*> edit_list;
+ edit_list.emplace_back(edit);
+ edit_lists.emplace_back(edit_list);
+ return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ db_directory, new_descriptor_log, column_family_options);
+ }
+  // The batch version. If edit_list.size() > 1, caller must ensure that
+  // no edit in the list is a column family add or drop.
+ Status LogAndApply(
+ ColumnFamilyData* column_family_data,
+ const MutableCFOptions& mutable_cf_options,
+ const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
+ Directory* db_directory = nullptr, bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr) {
+ autovector<ColumnFamilyData*> cfds;
+ cfds.emplace_back(column_family_data);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ mutable_cf_options_list.emplace_back(&mutable_cf_options);
+ autovector<autovector<VersionEdit*>> edit_lists;
+ edit_lists.emplace_back(edit_list);
+ return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ db_directory, new_descriptor_log, column_family_options);
+ }
+
+  // The across-multi-cf batch version. If edit_lists contains more than
+  // one version edit, the caller must ensure that no edit in the list is a
+  // column family manipulation.
+ virtual Status LogAndApply(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ InstrumentedMutex* mu, Directory* db_directory = nullptr,
+ bool new_descriptor_log = false,
+ const ColumnFamilyOptions* new_cf_options = nullptr);
+
+ static Status GetCurrentManifestPath(const std::string& dbname,
+ FileSystem* fs,
+ std::string* manifest_filename,
+ uint64_t* manifest_file_number);
+
+ // Recover the last saved descriptor from persistent storage.
+ // If read_only == true, Recover() will not complain if some column families
+ // are not opened
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, std::string* db_id = nullptr);
+
+ // Reads a manifest file and returns a list of column families in
+ // column_families.
+ static Status ListColumnFamilies(std::vector<std::string>* column_families,
+ const std::string& dbname, FileSystem* fs);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid when
+  // only one level from the new max level to the old
+  // max level contains files.
+  // The call is static, since the number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces the number of levels
+  // in a DB by applying changes to the manifest.
+  // For example, if a db currently has 7 levels [0-6], a call to
+  // reduce to 5 levels [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+ static Status ReduceNumberOfLevels(const std::string& dbname,
+ const Options* options,
+ const FileOptions& file_options,
+ int new_levels);
+
+ // Get the checksum information of all live files
+ Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list);
+
+ // printf contents (for debugging)
+ Status DumpManifest(Options& options, std::string& manifestFileName,
+ bool verbose, bool hex = false, bool json = false);
+
+#endif // ROCKSDB_LITE
+
+ // Return the current manifest file number
+ uint64_t manifest_file_number() const { return manifest_file_number_; }
+
+ uint64_t options_file_number() const { return options_file_number_; }
+
+ uint64_t pending_manifest_file_number() const {
+ return pending_manifest_file_number_;
+ }
+
+ uint64_t current_next_file_number() const { return next_file_number_.load(); }
+
+ uint64_t min_log_number_to_keep_2pc() const {
+ return min_log_number_to_keep_2pc_.load();
+ }
+
+ // Allocate and return a new file number
+ uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
+
+  // Fetch and add n new file numbers
+ uint64_t FetchAddFileNumber(uint64_t n) {
+ return next_file_number_.fetch_add(n);
+ }
+
+ // Return the last sequence number.
+ uint64_t LastSequence() const {
+ return last_sequence_.load(std::memory_order_acquire);
+ }
+
+ // Note: memory_order_acquire must be sufficient.
+ uint64_t LastAllocatedSequence() const {
+ return last_allocated_sequence_.load(std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_acquire must be sufficient.
+ uint64_t LastPublishedSequence() const {
+ return last_published_sequence_.load(std::memory_order_seq_cst);
+ }
+
+ // Set the last sequence number to s.
+ void SetLastSequence(uint64_t s) {
+ assert(s >= last_sequence_);
+ // Last visible sequence must always be less than last written seq
+ assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
+ last_sequence_.store(s, std::memory_order_release);
+ }
+
+ // Note: memory_order_release must be sufficient
+ void SetLastPublishedSequence(uint64_t s) {
+ assert(s >= last_published_sequence_);
+ last_published_sequence_.store(s, std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_release must be sufficient
+ void SetLastAllocatedSequence(uint64_t s) {
+ assert(s >= last_allocated_sequence_);
+ last_allocated_sequence_.store(s, std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_release must be sufficient
+ uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
+ return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
+ }
+
+ // Mark the specified file number as used.
+ // REQUIRED: this is only called during single-threaded recovery or repair.
+ void MarkFileNumberUsed(uint64_t number);
+
+ // Mark the specified log number as deleted
+ // REQUIRED: this is only called during single-threaded recovery or repair, or
+ // from ::LogAndApply where the global mutex is held.
+ void MarkMinLogNumberToKeep2PC(uint64_t number);
+
+ // Return the log file number for the log file that is currently
+ // being compacted, or zero if there is no such log file.
+ uint64_t prev_log_number() const { return prev_log_number_; }
+
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file.
+ // In non-2PC mode, all the log numbers smaller than this number can be safely
+ // deleted.
+ uint64_t MinLogNumberWithUnflushedData() const {
+ return PreComputeMinLogNumberWithUnflushedData(nullptr);
+ }
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file, except data from `cfd_to_skip`.
+ uint64_t PreComputeMinLogNumberWithUnflushedData(
+ const ColumnFamilyData* cfd_to_skip) const {
+ uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *column_family_set_) {
+ if (cfd == cfd_to_skip) {
+ continue;
+ }
+ // It's safe to ignore dropped column families here:
+ // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+ if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+ min_log_num = cfd->GetLogNumber();
+ }
+ }
+ return min_log_num;
+ }
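+  // Worked example for the two helpers above (hypothetical numbers): with two
+  // live column families whose log numbers are 10 and 15, the minimum is 10,
+  // so in non-2PC mode every WAL numbered strictly below 10 contains only
+  // flushed data and can be safely deleted.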
+
+ // Create an iterator that reads over the compaction inputs for "*c".
+ // The caller should delete the iterator when no longer needed.
+ InternalIterator* MakeInputIterator(
+ const Compaction* c, RangeDelAggregator* range_del_agg,
+ const FileOptions& file_options_compactions);
+
+ // Add all files listed in any live version to *live.
+ void AddLiveFiles(std::vector<FileDescriptor>* live_list);
+
+ // Return the approximate size of data to be scanned for range [start, end)
+ // in levels [start_level, end_level). If end_level == -1 it will search
+ // through all non-empty levels
+ uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
+ const Slice& start, const Slice& end,
+ int start_level, int end_level,
+ TableReaderCaller caller);
+
+ // Return the size of the current manifest file
+ uint64_t manifest_file_size() const { return manifest_file_size_; }
+
+ // verify that the files that we started with for a compaction
+ // still exist in the current version and in the same original level.
+ // This ensures that a concurrent compaction did not erroneously
+ // pick the same files to compact.
+ bool VerifyCompactionFileConsistency(Compaction* c);
+
+ Status GetMetadataForFile(uint64_t number, int* filelevel,
+ FileMetaData** metadata, ColumnFamilyData** cfd);
+
+ // This function doesn't support leveldb SST filenames
+ void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata);
+
+ void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+ std::vector<std::string>* manifest_filenames,
+ uint64_t min_pending_output);
+
+ ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+ const FileOptions& file_options() { return file_options_; }
+ void ChangeFileOptions(const MutableDBOptions& new_options) {
+ file_options_.writable_file_max_buffer_size =
+ new_options.writable_file_max_buffer_size;
+ }
+
+ const ImmutableDBOptions* db_options() const { return db_options_; }
+
+ static uint64_t GetNumLiveVersions(Version* dummy_versions);
+
+ static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
+
+ protected:
+ struct ManifestWriter;
+
+ friend class Version;
+ friend class DBImpl;
+ friend class DBImplReadOnly;
+
+ struct LogReporter : public log::Reader::Reporter {
+ Status* status;
+ virtual void Corruption(size_t /*bytes*/, const Status& s) override {
+ if (this->status->ok()) *this->status = s;
+ }
+ };
+
+ // Returns approximated offset of a key in a file for a given version.
+ uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+ const Slice& key, TableReaderCaller caller);
+
+ // Returns approximated data size between start and end keys in a file
+ // for a given version.
+ uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+ const Slice& start, const Slice& end,
+ TableReaderCaller caller);
+
+ struct MutableCFState {
+ uint64_t log_number;
+ };
+
+ // Save current contents to *log
+ Status WriteCurrentStateToManifest(
+ const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+ log::Writer* log);
+
+ void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+ ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ VersionEdit* edit);
+
+ Status ReadAndRecover(
+ log::Reader* reader, AtomicGroupReadBuffer* read_buffer,
+ const std::unordered_map<std::string, ColumnFamilyOptions>&
+ name_to_options,
+ std::unordered_map<int, std::string>& column_families_not_found,
+ std::unordered_map<
+ uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>& builders,
+ VersionEditParams* version_edit, std::string* db_id = nullptr);
+
+ // REQUIRES db mutex
+ Status ApplyOneVersionEditToBuilder(
+ VersionEdit& edit,
+ const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_opts,
+ std::unordered_map<int, std::string>& column_families_not_found,
+ std::unordered_map<
+ uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>& builders,
+ VersionEditParams* version_edit);
+
+ Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+ const VersionEdit& from_edit,
+ VersionEditParams* version_edit_params);
+
+ std::unique_ptr<ColumnFamilySet> column_family_set_;
+
+ Env* const env_;
+ FileSystem* const fs_;
+ const std::string dbname_;
+ std::string db_id_;
+ const ImmutableDBOptions* const db_options_;
+ std::atomic<uint64_t> next_file_number_;
+  // Any log number equal to or lower than this should be ignored during
+  // recovery, and is eligible for deletion in 2PC mode. In non-2PC mode, this
+  // number is ignored.
+ std::atomic<uint64_t> min_log_number_to_keep_2pc_ = {0};
+ uint64_t manifest_file_number_;
+ uint64_t options_file_number_;
+ uint64_t pending_manifest_file_number_;
+ // The last seq visible to reads. It normally indicates the last sequence in
+ // the memtable but when using two write queues it could also indicate the
+ // last sequence in the WAL visible to reads.
+ std::atomic<uint64_t> last_sequence_;
+ // The last seq that is already allocated. It is applicable only when we have
+  // two write queues. In that case the seq might or might not have appeared in
+  // the memtable but it is expected to appear in the WAL.
+ // We have last_sequence <= last_allocated_sequence_
+ std::atomic<uint64_t> last_allocated_sequence_;
+ // The last allocated sequence that is also published to the readers. This is
+ // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
+ // last_sequence_ also indicates the last published seq.
+ // We have last_sequence <= last_published_sequence_ <=
+ // last_allocated_sequence_
+ std::atomic<uint64_t> last_published_sequence_;
+ uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
+
+ // Opened lazily
+ std::unique_ptr<log::Writer> descriptor_log_;
+
+  // generates an increasing version number for every new version
+ uint64_t current_version_number_;
+
+ // Queue of writers to the manifest file
+ std::deque<ManifestWriter*> manifest_writers_;
+
+ // Current size of manifest file
+ uint64_t manifest_file_size_;
+
+ std::vector<ObsoleteFileInfo> obsolete_files_;
+ std::vector<std::string> obsolete_manifests_;
+
+ // env options for all reads and writes except compactions
+ FileOptions file_options_;
+
+ BlockCacheTracer* const block_cache_tracer_;
+
+ private:
+ // REQUIRES db mutex at beginning. may release and re-acquire db mutex
+ Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
+ InstrumentedMutex* mu, Directory* db_directory,
+ bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options);
+
+ void LogAndApplyCFHelper(VersionEdit* edit);
+ Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
+ VersionEdit* edit, InstrumentedMutex* mu);
+};
+
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
+class ReactiveVersionSet : public VersionSet {
+ public:
+ ReactiveVersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& _file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller);
+
+ ~ReactiveVersionSet() override;
+
+ Status ReadAndApply(
+ InstrumentedMutex* mu,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed);
+
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+ std::unique_ptr<Status>* manifest_reader_status);
+
+ uint64_t TEST_read_edits_in_atomic_group() const {
+ return read_buffer_.TEST_read_edits_in_atomic_group();
+ }
+ std::vector<VersionEdit>& replay_buffer() {
+ return read_buffer_.replay_buffer();
+ }
+
+ protected:
+ using VersionSet::ApplyOneVersionEditToBuilder;
+
+ // REQUIRES db mutex
+ Status ApplyOneVersionEditToBuilder(
+ VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ VersionEdit* version_edit);
+
+ Status MaybeSwitchManifest(
+ log::Reader::Reporter* reporter,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
+
+ private:
+ std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+ active_version_builders_;
+ AtomicGroupReadBuffer read_buffer_;
+ // Number of version edits to skip by ReadAndApply at the beginning of a new
+ // MANIFEST created by primary.
+ int number_of_edits_to_skip_;
+
+ using VersionSet::LogAndApply;
+ using VersionSet::Recover;
+
+ Status LogAndApply(
+ const autovector<ColumnFamilyData*>& /*cfds*/,
+ const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
+ const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
+ InstrumentedMutex* /*mu*/, Directory* /*db_directory*/,
+ bool /*new_descriptor_log*/,
+ const ColumnFamilyOptions* /*new_cf_option*/) override {
+ return Status::NotSupported("not supported in reactive mode");
+ }
+
+ // No copy allowed
+ ReactiveVersionSet(const ReactiveVersionSet&);
+ ReactiveVersionSet& operator=(const ReactiveVersionSet&);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
new file mode 100644
index 000000000..03e0e26d2
--- /dev/null
+++ b/src/rocksdb/db/version_set_test.cc
@@ -0,0 +1,1287 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "db/db_impl/db_impl.h"
+#include "db/log_writer.h"
+#include "logging/logging.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class GenerateLevelFilesBriefTest : public testing::Test {
+ public:
+ std::vector<FileMetaData*> files_;
+ LevelFilesBrief file_level_;
+ Arena arena_;
+
+ GenerateLevelFilesBriefTest() { }
+
+ ~GenerateLevelFilesBriefTest() override {
+ for (size_t i = 0; i < files_.size(); i++) {
+ delete files_[i];
+ }
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ FileMetaData* f = new FileMetaData(
+ files_.size() + 1, 0, 0,
+ InternalKey(smallest, smallest_seq, kTypeValue),
+ InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+ largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ files_.push_back(f);
+ }
+
+ int Compare() {
+ int diff = 0;
+ for (size_t i = 0; i < files_.size(); i++) {
+ if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) {
+ diff++;
+ }
+ }
+ return diff;
+ }
+};
+
+TEST_F(GenerateLevelFilesBriefTest, Empty) {
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(0u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Single) {
+ Add("p", "q");
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(1u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Multiple) {
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(4u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+class CountingLogger : public Logger {
+ public:
+ CountingLogger() : log_count(0) {}
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ int log_count;
+};
+
+Options GetOptionsWithNumLevels(int num_levels,
+ std::shared_ptr<CountingLogger> logger) {
+ Options opt;
+ opt.num_levels = num_levels;
+ opt.info_log = logger;
+ return opt;
+}
+
+class VersionStorageInfoTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ std::shared_ptr<CountingLogger> logger_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ VersionStorageInfo vstorage_;
+
+ InternalKey GetInternalKey(const char* ukey,
+ SequenceNumber smallest_seq = 100) {
+ return InternalKey(ukey, smallest_seq, kTypeValue);
+ }
+
+ VersionStorageInfoTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ logger_(new CountingLogger()),
+ options_(GetOptionsWithNumLevels(6, logger_)),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, nullptr, false) {}
+
+ ~VersionStorageInfoTest() override {
+ for (int i = 0; i < vstorage_.num_levels(); i++) {
+ for (auto* f : vstorage_.LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 0) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, 0, file_size, GetInternalKey(smallest, 0),
+ GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0,
+ /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ f->compensated_file_size = file_size;
+ vstorage_.AddFile(level, f);
+ }
+
+ void Add(int level, uint32_t file_number, const InternalKey& smallest,
+ const InternalKey& largest, uint64_t file_size = 0) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
+ /* largest_seq */ 0, /* marked_for_compact */ false,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName);
+ f->compensated_file_size = file_size;
+ vstorage_.AddFile(level, f);
+ }
+
+ std::string GetOverlappingFiles(int level, const InternalKey& begin,
+ const InternalKey& end) {
+ std::vector<FileMetaData*> inputs;
+ vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
+
+ std::string result;
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (i > 0) {
+ result += ",";
+ }
+ AppendNumberTo(&result, inputs[i]->fd.GetNumber());
+ }
+ return result;
+ }
+};
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.max_bytes_for_level_base = 10;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ Add(4, 100U, "1", "2");
+ Add(5, 101U, "1", "2");
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 10U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 50U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 250U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1250U);
+
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ Add(5, 1U, "1", "2", 500U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.base_level(), 5);
+
+ Add(5, 2U, "3", "4", 550U);
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 4);
+
+ Add(4, 3U, "3", "4", 550U);
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 4);
+
+ Add(3, 4U, "3", "4", 250U);
+ Add(3, 5U, "5", "7", 300U);
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(1, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 3);
+
+ Add(1, 6U, "3", "4", 5U);
+ Add(1, 7U, "8", "9", 5U);
+ logger_->log_count = 0;
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(1, logger_->log_count);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 1);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 100;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 2;
+ Add(0, 1U, "1", "2", 50U);
+ Add(1, 2U, "1", "2", 50U);
+ Add(2, 3U, "1", "2", 500U);
+ Add(3, 4U, "1", "2", 500U);
+ Add(4, 5U, "1", "2", 1700U);
+ Add(5, 6U, "1", "2", 500U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U);
+ ASSERT_EQ(vstorage_.base_level(), 1);
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
+ uint64_t kOneGB = 1000U * 1000U * 1000U;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10U * kOneGB;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ Add(0, 1U, "1", "2", 50U);
+ Add(3, 4U, "1", "2", 32U * kOneGB);
+ Add(4, 5U, "1", "2", 500U * kOneGB);
+ Add(5, 6U, "1", "2", 3000U * kOneGB);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kOneGB);
+ ASSERT_EQ(vstorage_.base_level(), 2);
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 40000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+ Add(0, 1U, "1", "2", 10000U);
+ Add(0, 2U, "1", "2", 10000U);
+ Add(0, 3U, "1", "2", 10000U);
+
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 200000U);
+ Add(3, 6U, "1", "2", 40000U);
+ Add(2, 7U, "1", "2", 8000U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+  // The level multiplier is expected to stay at 5.0 here.
+ ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
+  // Level sizes should be 40,000, 51,450 and 257,250.
+ ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+ ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+ Add(0, 11U, "1", "2", 10000U);
+ Add(0, 12U, "1", "2", 10000U);
+ Add(0, 13U, "1", "2", 10000U);
+
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 200000U);
+ Add(3, 6U, "1", "2", 40000U);
+ Add(2, 7U, "1", "2", 8000U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+ // level multiplier should be 3.5
+ ASSERT_LT(vstorage_.level_multiplier(), 3.6);
+ ASSERT_GT(vstorage_.level_multiplier(), 3.4);
+ // Level size should be around 30,000, 105,000, 367,500
+ ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
+ ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+ Add(0, 11U, "1", "2", 5000U);
+ Add(0, 12U, "1", "2", 5000U);
+ Add(0, 13U, "1", "2", 5000U);
+ Add(0, 14U, "1", "2", 5000U);
+ Add(0, 15U, "1", "2", 5000U);
+ Add(0, 16U, "1", "2", 5000U);
+
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 200000U);
+ Add(3, 6U, "1", "2", 40000U);
+ Add(2, 7U, "1", "2", 8000U);
+
+ vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+ // level multiplier should be 3.5
+ ASSERT_LT(vstorage_.level_multiplier(), 3.6);
+ ASSERT_GT(vstorage_.level_multiplier(), 3.4);
+ // Level size should be around 30,000, 105,000, 367,500
+ ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
+ ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
+ // Test whether the overlaps are detected as expected
+ Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level
+ Add(2, 2U, "3", "5", 1U); // Partial overlap with last level
+ Add(2, 3U, "6", "8", 1U); // Partial overlap with last level
+ Add(3, 4U, "1", "9", 1U); // Contains range of last level
+ Add(4, 5U, "4", "5", 1U); // Inside range of last level
+ Add(4, 5U, "6", "7", 1U); // Inside range of last level
+ Add(5, 6U, "4", "7", 10U);
+ ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
+ Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered
+ Add(0, 1U, "5", "6", 1U); // Ignored because of [5,6] in l1
+ Add(1, 1U, "1", "2", 1U); // Ignored because of [2,3] in l2
+ Add(1, 2U, "3", "4", 1U); // Ignored because of [2,3] in l2
+ Add(1, 3U, "5", "6", 1U);
+ Add(2, 4U, "2", "3", 1U);
+ Add(3, 5U, "7", "8", 1U);
+ ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, GetOverlappingInputs) {
+ // Two files that overlap at the range deletion tombstone sentinel.
+ Add(1, 1U, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1);
+ Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1);
+ // Two files that overlap at the same user key.
+ Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1);
+ Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1);
+ // Two files that do not overlap.
+ Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1);
+ Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1);
+ vstorage_.UpdateNumNonEmptyLevels();
+ vstorage_.GenerateLevelFilesBrief();
+
+ ASSERT_EQ("1,2", GetOverlappingFiles(
+ 1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue}));
+ ASSERT_EQ("1", GetOverlappingFiles(
+ 1, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}));
+ ASSERT_EQ("2", GetOverlappingFiles(
+ 1, {"b", kMaxSequenceNumber, kTypeValue}, {"c", 0, kTypeValue}));
+ ASSERT_EQ("3,4", GetOverlappingFiles(
+ 1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue}));
+ ASSERT_EQ("3", GetOverlappingFiles(
+ 1, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeRangeDeletion}));
+ ASSERT_EQ("3,4", GetOverlappingFiles(
+ 1, {"e", kMaxSequenceNumber, kTypeValue}, {"f", 0, kTypeValue}));
+ ASSERT_EQ("3,4", GetOverlappingFiles(
+ 1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}));
+ ASSERT_EQ("5", GetOverlappingFiles(
+ 1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}));
+ ASSERT_EQ("6", GetOverlappingFiles(
+ 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}));
+}
+
+class FindLevelFileTest : public testing::Test {
+ public:
+ LevelFilesBrief file_level_;
+ bool disjoint_sorted_files_;
+ Arena arena_;
+
+ FindLevelFileTest() : disjoint_sorted_files_(true) { }
+
+ ~FindLevelFileTest() override {}
+
+ void LevelFileInit(size_t num = 0) {
+ char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange));
+ file_level_.files = new (mem)FdWithKeyRange[num];
+ file_level_.num_files = 0;
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue);
+ InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue);
+
+ Slice smallest_slice = smallest_key.Encode();
+ Slice largest_slice = largest_key.Encode();
+
+ char* mem = arena_.AllocateAligned(
+ smallest_slice.size() + largest_slice.size());
+ memcpy(mem, smallest_slice.data(), smallest_slice.size());
+ memcpy(mem + smallest_slice.size(), largest_slice.data(),
+ largest_slice.size());
+
+ // add to file_level_
+ size_t num = file_level_.num_files;
+ auto& file = file_level_.files[num];
+ file.fd = FileDescriptor(num + 1, 0, 0);
+ file.smallest_key = Slice(mem, smallest_slice.size());
+ file.largest_key = Slice(mem + smallest_slice.size(),
+ largest_slice.size());
+ file_level_.num_files++;
+ }
+
+ int Find(const char* key) {
+ InternalKey target(key, 100, kTypeValue);
+ InternalKeyComparator cmp(BytewiseComparator());
+ return FindFile(cmp, file_level_, target.Encode());
+ }
+
+ bool Overlaps(const char* smallest, const char* largest) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ Slice s(smallest != nullptr ? smallest : "");
+ Slice l(largest != nullptr ? largest : "");
+ return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_,
+ (smallest != nullptr ? &s : nullptr),
+ (largest != nullptr ? &l : nullptr));
+ }
+};
+
+TEST_F(FindLevelFileTest, LevelEmpty) {
+ LevelFileInit(0);
+
+ ASSERT_EQ(0, Find("foo"));
+ ASSERT_TRUE(! Overlaps("a", "z"));
+ ASSERT_TRUE(! Overlaps(nullptr, "z"));
+ ASSERT_TRUE(! Overlaps("a", nullptr));
+ ASSERT_TRUE(! Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelSingle) {
+ LevelFileInit(1);
+
+ Add("p", "q");
+ ASSERT_EQ(0, Find("a"));
+ ASSERT_EQ(0, Find("p"));
+ ASSERT_EQ(0, Find("p1"));
+ ASSERT_EQ(0, Find("q"));
+ ASSERT_EQ(1, Find("q1"));
+ ASSERT_EQ(1, Find("z"));
+
+ ASSERT_TRUE(! Overlaps("a", "b"));
+ ASSERT_TRUE(! Overlaps("z1", "z2"));
+ ASSERT_TRUE(Overlaps("a", "p"));
+ ASSERT_TRUE(Overlaps("a", "q"));
+ ASSERT_TRUE(Overlaps("a", "z"));
+ ASSERT_TRUE(Overlaps("p", "p1"));
+ ASSERT_TRUE(Overlaps("p", "q"));
+ ASSERT_TRUE(Overlaps("p", "z"));
+ ASSERT_TRUE(Overlaps("p1", "p2"));
+ ASSERT_TRUE(Overlaps("p1", "z"));
+ ASSERT_TRUE(Overlaps("q", "q"));
+ ASSERT_TRUE(Overlaps("q", "q1"));
+
+ ASSERT_TRUE(! Overlaps(nullptr, "j"));
+ ASSERT_TRUE(! Overlaps("r", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "p"));
+ ASSERT_TRUE(Overlaps(nullptr, "p1"));
+ ASSERT_TRUE(Overlaps("q", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelMultiple) {
+ LevelFileInit(4);
+
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_EQ(0, Find("100"));
+ ASSERT_EQ(0, Find("150"));
+ ASSERT_EQ(0, Find("151"));
+ ASSERT_EQ(0, Find("199"));
+ ASSERT_EQ(0, Find("200"));
+ ASSERT_EQ(1, Find("201"));
+ ASSERT_EQ(1, Find("249"));
+ ASSERT_EQ(1, Find("250"));
+ ASSERT_EQ(2, Find("251"));
+ ASSERT_EQ(2, Find("299"));
+ ASSERT_EQ(2, Find("300"));
+ ASSERT_EQ(2, Find("349"));
+ ASSERT_EQ(2, Find("350"));
+ ASSERT_EQ(3, Find("351"));
+ ASSERT_EQ(3, Find("400"));
+ ASSERT_EQ(3, Find("450"));
+ ASSERT_EQ(4, Find("451"));
+
+ ASSERT_TRUE(! Overlaps("100", "149"));
+ ASSERT_TRUE(! Overlaps("251", "299"));
+ ASSERT_TRUE(! Overlaps("451", "500"));
+ ASSERT_TRUE(! Overlaps("351", "399"));
+
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) {
+ LevelFileInit(4);
+
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_TRUE(! Overlaps(nullptr, "149"));
+ ASSERT_TRUE(! Overlaps("451", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "150"));
+ ASSERT_TRUE(Overlaps(nullptr, "199"));
+ ASSERT_TRUE(Overlaps(nullptr, "200"));
+ ASSERT_TRUE(Overlaps(nullptr, "201"));
+ ASSERT_TRUE(Overlaps(nullptr, "400"));
+ ASSERT_TRUE(Overlaps(nullptr, "800"));
+ ASSERT_TRUE(Overlaps("100", nullptr));
+ ASSERT_TRUE(Overlaps("200", nullptr));
+ ASSERT_TRUE(Overlaps("449", nullptr));
+ ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) {
+ LevelFileInit(1);
+
+ Add("200", "200", 5000, 3000);
+ ASSERT_TRUE(! Overlaps("199", "199"));
+ ASSERT_TRUE(! Overlaps("201", "300"));
+ ASSERT_TRUE(Overlaps("200", "200"));
+ ASSERT_TRUE(Overlaps("190", "200"));
+ ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
+ LevelFileInit(2);
+
+ Add("150", "600");
+ Add("400", "500");
+ disjoint_sorted_files_ = false;
+ ASSERT_TRUE(! Overlaps("100", "149"));
+ ASSERT_TRUE(! Overlaps("601", "700"));
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+ ASSERT_TRUE(Overlaps("450", "700"));
+ ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+class VersionSetTestBase {
+ public:
+ const static std::string kColumnFamilyName1;
+ const static std::string kColumnFamilyName2;
+ const static std::string kColumnFamilyName3;
+ int num_initial_edits_;
+
+ VersionSetTestBase()
+ : env_(Env::Default()),
+ fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+ dbname_(test::PerThreadDBPath("version_set_test")),
+ db_options_(),
+ mutable_cf_options_(cf_options_),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ shutting_down_(false),
+ mock_table_factory_(std::make_shared<mock::MockTableFactory>()) {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+                                   /*block_cache_tracer=*/nullptr));
+ reactive_versions_ = std::make_shared<ReactiveVersionSet>(
+ dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_);
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ }
+
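+  // Writes MANIFEST-000001 containing a new-DB edit plus one edit per extra
+  // column family, and returns the log writer so tests can append more edits.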
+ void PrepareManifest(std::vector<ColumnFamilyDescriptor>* column_families,
+ SequenceNumber* last_seqno,
+ std::unique_ptr<log::Writer>* log_writer) {
+ assert(column_families != nullptr);
+ assert(last_seqno != nullptr);
+ assert(log_writer != nullptr);
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::vector<std::string> cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ const int kInitialNumOfCfs = static_cast<int>(cf_names.size());
+ autovector<VersionEdit> new_cfs;
+ uint64_t last_seq = 1;
+ uint32_t cf_id = 1;
+ for (int i = 1; i != kInitialNumOfCfs; ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ new_cf.SetLogNumber(0);
+ new_cf.SetNextFile(2);
+ new_cf.SetLastSequence(last_seq++);
+ new_cfs.emplace_back(new_cf);
+ }
+ *last_seqno = last_seq;
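+    // The initial MANIFEST holds one new-DB edit plus one edit per extra CF.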
+ num_initial_edits_ = static_cast<int>(new_cfs.size() + 1);
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFile> file;
+ Status s = env_->NewWritableFile(
+ manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_));
+ {
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ for (const auto& e : new_cfs) {
+ record.clear();
+ e.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(s);
+
+ cf_options_.table_factory = mock_table_factory_;
+ for (const auto& cf_name : cf_names) {
+ column_families->emplace_back(cf_name, cf_options_);
+ }
+ }
+
+ // Create DB with 3 column families.
+ void NewDB() {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ SequenceNumber last_seqno;
+ std::unique_ptr<log::Writer> log_writer;
+ SetIdentityFile(env_, dbname_);
+ PrepareManifest(&column_families, &last_seqno, &log_writer);
+ log_writer.reset();
+ // Make "CURRENT" file point to the new manifest file.
+ Status s = SetCurrentFile(env_, dbname_, 1, nullptr);
+ ASSERT_OK(s);
+
+ EXPECT_OK(versions_->Recover(column_families, false));
+ EXPECT_EQ(column_families.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ const std::string dbname_;
+ EnvOptions env_options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ MutableCFOptions mutable_cf_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::shared_ptr<VersionSet> versions_;
+ std::shared_ptr<ReactiveVersionSet> reactive_versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
+const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
+const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
+
+class VersionSetTest : public VersionSetTestBase, public testing::Test {
+ public:
+ VersionSetTest() : VersionSetTestBase() {}
+};
+
+TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
+ NewDB();
+ const int kGroupSize = 5;
+ autovector<VersionEdit> edits;
+ for (int i = 0; i != kGroupSize; ++i) {
+ edits.emplace_back(VersionEdit());
+ }
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> all_mutable_cf_options;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (int i = 0; i != kGroupSize; ++i) {
+ cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault());
+ all_mutable_cf_options.emplace_back(&mutable_cf_options_);
+ autovector<VersionEdit*> edit_list;
+ edit_list.emplace_back(&edits[i]);
+ edit_lists.emplace_back(edit_list);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int count = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
+ uint32_t* cf_id = reinterpret_cast<uint32_t*>(arg);
+ EXPECT_EQ(0u, *cf_id);
+ ++count;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ mutex_.Lock();
+ Status s =
+ versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists, &mutex_);
+ mutex_.Unlock();
+ EXPECT_OK(s);
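+  // The callback is expected to fire once for each of the kGroupSize - 1
+  // follower edits that target the same default column family.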
+ EXPECT_EQ(kGroupSize - 1, count);
+}
+
+class VersionSetAtomicGroupTest : public VersionSetTestBase,
+ public testing::Test {
+ public:
+ VersionSetAtomicGroupTest() : VersionSetTestBase() {}
+
+ void SetUp() override {
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ SetupTestSyncPoints();
+ }
+
+ void SetupValidAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
+ edits_[i].MarkAtomicGroup(--remaining);
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+ }
+
+ void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
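+    // Same edits as SetupValidAtomicGroup(); the group only becomes
+    // "incomplete" because callers persist fewer than atomic_group_size edits.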
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
+ edits_[i].MarkAtomicGroup(--remaining);
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+ }
+
+ void SetupCorruptedAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
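+      // Leave the middle edit unmarked so a plain edit appears inside the
+      // atomic group, which recovery must report as corruption.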
+ if (i != ((size_t)atomic_group_size / 2)) {
+ edits_[i].MarkAtomicGroup(--remaining);
+ }
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+ }
+
+ void SetupIncorrectAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
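+      // Record a wrong remaining-count for the second edit so recovery
+      // detects an inconsistent atomic group size.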
+ if (i != 1) {
+ edits_[i].MarkAtomicGroup(--remaining);
+ } else {
+ edits_[i].MarkAtomicGroup(remaining--);
+ }
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+ }
+
+ void SetupTestSyncPoints() {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) {
+ VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+ EXPECT_EQ(edits_.front().DebugString(),
+ e->DebugString()); // compare based on value
+ first_in_atomic_group_ = true;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) {
+ VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+ EXPECT_EQ(edits_.back().DebugString(),
+ e->DebugString()); // compare based on value
+ EXPECT_TRUE(first_in_atomic_group_);
+ last_in_atomic_group_ = true;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) {
+ num_recovered_edits_ = *reinterpret_cast<int*>(arg);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "ReactiveVersionSet::ReadAndApply:AppliedEdits",
+ [&](void* arg) { num_applied_edits_ = *reinterpret_cast<int*>(arg); });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroup",
+ [&](void* /* arg */) { ++num_edits_in_atomic_group_; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits",
+ [&](void* arg) {
+ corrupted_edit_ = *reinterpret_cast<VersionEdit*>(arg);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize",
+ [&](void* arg) {
+ edit_with_incorrect_group_size_ =
+ *reinterpret_cast<VersionEdit*>(arg);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ void AddNewEditsToLog(int num_edits) {
+ for (int i = 0; i < num_edits; i++) {
+ std::string record;
+ edits_[i].EncodeTo(&record);
+ ASSERT_OK(log_writer_->AddRecord(record));
+ }
+ }
+
+ void TearDown() override {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ log_writer_.reset();
+ }
+
+ protected:
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ SequenceNumber last_seqno_;
+ std::vector<VersionEdit> edits_;
+ bool first_in_atomic_group_ = false;
+ bool last_in_atomic_group_ = false;
+ int num_edits_in_atomic_group_ = 0;
+ int num_recovered_edits_ = 0;
+ int num_applied_edits_ = 0;
+ VersionEdit corrupted_edit_;
+ VersionEdit edit_with_incorrect_group_size_;
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+ EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+ EXPECT_EQ(0, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleValidAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+  // Recover() should leave the replay buffer empty.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+ EXPECT_EQ(0, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ AddNewEditsToLog(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+  // Applying the complete atomic group should leave the replay buffer empty.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ EXPECT_EQ(kAtomicGroupSize, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ EXPECT_EQ(0, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ // Reactive version set should store the edits in the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+ kNumberOfPersistedVersionEdits);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+ // Write the last record. The reactive version set should now apply all
+ // edits.
+ std::string last_record;
+ edits_[kAtomicGroupSize - 1].EncodeTo(&last_record);
+ EXPECT_OK(log_writer_->AddRecord(last_record));
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+  // The replay buffer should be empty now.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ EXPECT_EQ(kAtomicGroupSize, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+  // At this point, no atomic-group edits have been written yet.
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ // Write a few edits in an atomic group.
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ // Reactive version set should store the edits in the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+ kNumberOfPersistedVersionEdits);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ EXPECT_EQ(0, num_applied_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_NOK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ // Write the corrupted edits.
+ AddNewEditsToLog(kAtomicGroupSize);
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_NOK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ AddNewEditsToLog(kAtomicGroupSize);
+ mu.Lock();
+ EXPECT_OK(
+ reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed));
+ mu.Unlock();
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+class VersionSetTestDropOneCF : public VersionSetTestBase,
+ public testing::TestWithParam<std::string> {
+ public:
+ VersionSetTestDropOneCF() : VersionSetTestBase() {}
+};
+
+// This test simulates the following execution sequence
+// Time thread1 bg_flush_thr
+// | Prepare version edits (e1,e2,e3) for atomic
+// | flush cf1, cf2, cf3
+// | Enqueue e to drop cfi
+// | to manifest_writers_
+// | Enqueue (e1,e2,e3) to manifest_writers_
+// |
+// | Apply e,
+// | cfi.IsDropped() is true
+// | Apply (e1,e2,e3),
+// | since cfi.IsDropped() == true, we need to
+// | drop ei and write the rest to MANIFEST.
+// V
+//
+// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and
+// last column family in an atomic group.
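+// The sync point below verifies that the dropped column family's edit is
+// removed while the remaining kAtomicGroupSize - 1 edits are still written
+// as a single atomic group.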
+TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ SequenceNumber last_seqno;
+ std::unique_ptr<log::Writer> log_writer;
+ PrepareManifest(&column_families, &last_seqno, &log_writer);
+ Status s = SetCurrentFile(env_, dbname_, 1, nullptr);
+ ASSERT_OK(s);
+
+ EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
+ EXPECT_EQ(column_families.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+
+ const int kAtomicGroupSize = 3;
+ const std::vector<std::string> non_default_cf_names = {
+ kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3};
+
+ // Drop one column family
+ VersionEdit drop_cf_edit;
+ drop_cf_edit.DropColumnFamily();
+ const std::string cf_to_drop_name(GetParam());
+ auto cfd_to_drop =
+ versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name);
+ ASSERT_NE(nullptr, cfd_to_drop);
+ // Increase its refcount because cfd_to_drop is used later, and we need to
+ // prevent it from being deleted.
+ cfd_to_drop->Ref();
+ drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfd_to_drop,
+ *cfd_to_drop->GetLatestMutableCFOptions(),
+ &drop_cf_edit, &mutex_);
+ mutex_.Unlock();
+ ASSERT_OK(s);
+
+ std::vector<VersionEdit> edits(kAtomicGroupSize);
+ uint32_t remaining = kAtomicGroupSize;
+ size_t i = 0;
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (const auto& cf_name : non_default_cf_names) {
+ auto cfd = (cf_name != cf_to_drop_name)
+ ? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name)
+ : cfd_to_drop;
+ ASSERT_NE(nullptr, cfd);
+ cfds.push_back(cfd);
+ mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
+ edits[i].SetColumnFamily(cfd->GetID());
+ edits[i].SetLogNumber(0);
+ edits[i].SetNextFile(2);
+ edits[i].MarkAtomicGroup(--remaining);
+ edits[i].SetLastSequence(last_seqno++);
+ autovector<VersionEdit*> tmp_edits;
+ tmp_edits.push_back(&edits[i]);
+ edit_lists.emplace_back(tmp_edits);
+ ++i;
+ }
+ int called = 0;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) {
+ std::vector<VersionEdit*>* tmp_edits =
+ reinterpret_cast<std::vector<VersionEdit*>*>(arg);
+ EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size());
+ for (const auto e : *tmp_edits) {
+ bool found = false;
+ for (const auto& e2 : edits) {
+ if (&e2 == e) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found);
+ }
+ ++called;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists,
+ &mutex_);
+ mutex_.Unlock();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, called);
+ if (cfd_to_drop->Unref()) {
+ delete cfd_to_drop;
+ cfd_to_drop = nullptr;
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ AtomicGroup, VersionSetTestDropOneCF,
+ testing::Values(VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc
new file mode 100644
index 000000000..5b699274c
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.cc
@@ -0,0 +1,510 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/wal_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sequence_file_reader.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+Status WalManager::DeleteFile(const std::string& fname, uint64_t number) {
+ auto s = env_->DeleteFile(db_options_.wal_dir + "/" + fname);
+ if (s.ok()) {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ return s;
+}
+
+Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
+ // First get sorted files in db dir, then get sorted files from archived
+ // dir, to avoid a race condition where a log file is moved to archived
+ // dir in between.
+ Status s;
+ // list wal files in main db dir.
+ VectorLogPtr logs;
+ s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Reproduce the race condition where a log file is moved
+ // to archived dir, between these two sync points, used in
+ // (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1");
+ TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2");
+
+ files.clear();
+ // list wal files in archive dir.
+ std::string archivedir = ArchivalDirectory(db_options_.wal_dir);
+ Status exists = env_->FileExists(archivedir);
+ if (exists.ok()) {
+ s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+ if (!s.ok()) {
+ return s;
+ }
+ } else if (!exists.IsNotFound()) {
+    assert(exists.IsIOError());
+    return exists;
+ }
+
+ uint64_t latest_archived_log_number = 0;
+ if (!files.empty()) {
+ latest_archived_log_number = files.back()->LogNumber();
+ ROCKS_LOG_INFO(db_options_.info_log, "Latest Archived log: %" PRIu64,
+ latest_archived_log_number);
+ }
+
+ files.reserve(files.size() + logs.size());
+ for (auto& log : logs) {
+ if (log->LogNumber() > latest_archived_log_number) {
+ files.push_back(std::move(log));
+ } else {
+ // When the race condition happens, we could see the
+ // same log in both db dir and archived dir. Simply
+ // ignore the one in db dir. Note that, if we read
+ // archived dir first, we would have missed the log file.
+ ROCKS_LOG_WARN(db_options_.info_log, "%s already moved to archive",
+ log->PathName().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status WalManager::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options,
+ VersionSet* version_set) {
+
+ // Get all sorted Wal Files.
+ // Do binary search and open files and find the seq number.
+
+ std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
+ Status s = GetSortedWalFiles(*wal_files);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = RetainProbableWalFiles(*wal_files, seq);
+ if (!s.ok()) {
+ return s;
+ }
+ iter->reset(new TransactionLogIteratorImpl(
+ db_options_.wal_dir, &db_options_, read_options, file_options_, seq,
+ std::move(wal_files), version_set, seq_per_batch_));
+ return (*iter)->status();
+}
+
+// 1. Go through all archived files and
+//    a. if TTL is enabled, delete outdated files
+//    b. if the archive size limit is enabled, delete empty files and track
+//       the number of remaining files and the largest file size.
+// 2. If the size limit is enabled:
+//    a. compute how many files should be deleted
+//    b. get the sorted non-empty archived logs
+//    c. delete the excess files, oldest first
+void WalManager::PurgeObsoleteWALFiles() {
+ bool const ttl_enabled = db_options_.wal_ttl_seconds > 0;
+ bool const size_limit_enabled = db_options_.wal_size_limit_mb > 0;
+ if (!ttl_enabled && !size_limit_enabled) {
+ return;
+ }
+
+ int64_t current_time;
+ Status s = env_->GetCurrentTime(&current_time);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s",
+ s.ToString().c_str());
+ assert(false);
+ return;
+ }
+ uint64_t const now_seconds = static_cast<uint64_t>(current_time);
+ uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled)
+ ? db_options_.wal_ttl_seconds / 2
+ : kDefaultIntervalToDeleteObsoleteWAL;
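+  // Skip this run unless enough time has passed: half the TTL when only TTL
+  // is enabled, otherwise the default interval (10 minutes).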
+
+ if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+ return;
+ }
+
+ purge_wal_files_last_run_ = now_seconds;
+
+ std::string archival_dir = ArchivalDirectory(db_options_.wal_dir);
+ std::vector<std::string> files;
+ s = env_->GetChildren(archival_dir, &files);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Can't get archive files: %s",
+ s.ToString().c_str());
+ assert(false);
+ return;
+ }
+
+ size_t log_files_num = 0;
+ uint64_t log_file_size = 0;
+
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kLogFile) {
+ std::string const file_path = archival_dir + "/" + f;
+ if (ttl_enabled) {
+ uint64_t file_m_time;
+ s = env_->GetFileModificationTime(file_path, &file_m_time);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Can't get file mod time: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ continue;
+ }
+ if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) {
+ s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ continue;
+ }
+ }
+
+ if (size_limit_enabled) {
+ uint64_t file_size;
+ s = env_->GetFileSize(file_path, &file_size);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Unable to get file size: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ return;
+ } else {
+ if (file_size > 0) {
+ log_file_size = std::max(log_file_size, file_size);
+ ++log_files_num;
+ } else {
+ s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Unable to delete file: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (0 == log_files_num || !size_limit_enabled) {
+ return;
+ }
+
+ size_t const files_keep_num =
+ static_cast<size_t>(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size);
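+  // Illustrative example: a wal_size_limit_mb of 16 and a largest archived
+  // log of 4 MB keep at most 16 * 1024 * 1024 / 4194304 = 4 archived files.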
+ if (log_files_num <= files_keep_num) {
+ return;
+ }
+
+ size_t files_del_num = log_files_num - files_keep_num;
+ VectorLogPtr archived_logs;
+ GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+
+ if (files_del_num > archived_logs.size()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Trying to delete more archived log files than "
+ "exist. Deleting all");
+ files_del_num = archived_logs.size();
+ }
+
+ for (size_t i = 0; i < files_del_num; ++i) {
+ std::string const file_path = archived_logs[i]->PathName();
+ s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path,
+ db_options_.wal_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(archived_logs[i]->LogNumber());
+ }
+ }
+}
+
+void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) {
+ auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number);
+ // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1");
+ Status s = env_->RenameFile(fname, archived_log_name);
+ // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2");
+ ROCKS_LOG_INFO(db_options_.info_log, "Move log file %s to %s -- %s\n",
+ fname.c_str(), archived_log_name.c_str(),
+ s.ToString().c_str());
+}
+
+Status WalManager::GetSortedWalsOfType(const std::string& path,
+ VectorLogPtr& log_files,
+ WalFileType log_type) {
+ std::vector<std::string> all_files;
+ const Status status = env_->GetChildren(path, &all_files);
+ if (!status.ok()) {
+ return status;
+ }
+ log_files.reserve(all_files.size());
+ for (const auto& f : all_files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kLogFile) {
+ SequenceNumber sequence;
+ Status s = ReadFirstRecord(log_type, number, &sequence);
+ if (!s.ok()) {
+ return s;
+ }
+ if (sequence == 0) {
+ // empty file
+ continue;
+ }
+
+ // Reproduce the race condition where a log file is moved
+ // to archived dir, between these two sync points, used in
+ // (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1");
+ TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2");
+
+ uint64_t size_bytes;
+ s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
+ // re-try in case the alive log file has been moved to archive.
+ if (!s.ok() && log_type == kAliveLogFile) {
+ std::string archived_file = ArchivedLogFileName(path, number);
+ if (env_->FileExists(archived_file).ok()) {
+ s = env_->GetFileSize(archived_file, &size_bytes);
+ if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+ // oops, the file just got deleted from archived dir! move on
+ s = Status::OK();
+ continue;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ log_files.push_back(std::unique_ptr<LogFile>(
+ new LogFileImpl(number, log_type, sequence, size_bytes)));
+ }
+ }
+ std::sort(
+ log_files.begin(), log_files.end(),
+ [](const std::unique_ptr<LogFile>& a, const std::unique_ptr<LogFile>& b) {
+ LogFileImpl* a_impl =
+ static_cast_with_check<LogFileImpl, LogFile>(a.get());
+ LogFileImpl* b_impl =
+ static_cast_with_check<LogFileImpl, LogFile>(b.get());
+ return *a_impl < *b_impl;
+ });
+ return status;
+}
+
+Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs,
+ const SequenceNumber target) {
+ int64_t start = 0; // signed to avoid overflow when target is < first file.
+ int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
+  // Binary search to avoid opening all the files.
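+  // Illustrative example: with start sequences {5, 20, 40} and target 25, the
+  // search ends with end == 1, so the files starting at 20 and 40 are kept;
+  // updates with sequence >= 25 can only live in those files.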
+ while (end >= start) {
+ int64_t mid = start + (end - start) / 2; // Avoid overflow.
+ SequenceNumber current_seq_num = all_logs.at(static_cast<size_t>(mid))->StartSequence();
+ if (current_seq_num == target) {
+ end = mid;
+ break;
+ } else if (current_seq_num < target) {
+ start = mid + 1;
+ } else {
+ end = mid - 1;
+ }
+ }
+  // end could be negative.
+ size_t start_index = static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
+ // The last wal file is always included
+ all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+ return Status::OK();
+}
+
+Status WalManager::ReadFirstRecord(const WalFileType type,
+ const uint64_t number,
+ SequenceNumber* sequence) {
+ *sequence = 0;
+ if (type != kAliveLogFile && type != kArchivedLogFile) {
+    ROCKS_LOG_ERROR(db_options_.info_log, "[WalManager] Unknown file type %s",
+ ToString(type).c_str());
+ return Status::NotSupported(
+ "File Type Not Known " + ToString(type));
+ }
+ {
+ MutexLock l(&read_first_record_cache_mutex_);
+ auto itr = read_first_record_cache_.find(number);
+ if (itr != read_first_record_cache_.end()) {
+ *sequence = itr->second;
+ return Status::OK();
+ }
+ }
+ Status s;
+ if (type == kAliveLogFile) {
+ std::string fname = LogFileName(db_options_.wal_dir, number);
+ s = ReadFirstLine(fname, number, sequence);
+ if (!s.ok() && env_->FileExists(fname).ok()) {
+ // return any error that is not caused by non-existing file
+ return s;
+ }
+ }
+
+ if (type == kArchivedLogFile || !s.ok()) {
+ // check if the file got moved to archive.
+ std::string archived_file =
+ ArchivedLogFileName(db_options_.wal_dir, number);
+ s = ReadFirstLine(archived_file, number, sequence);
+    // Maybe the file was deleted from the archive dir. If that's the case,
+    // return Status::OK(). The caller will identify this as an empty file
+    // because *sequence == 0.
+ if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+ return Status::OK();
+ }
+ }
+
+ if (s.ok() && *sequence != 0) {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.insert({number, *sequence});
+ }
+ return s;
+}
+
+Status WalManager::GetLiveWalFile(uint64_t number,
+ std::unique_ptr<LogFile>* log_file) {
+ if (!log_file) {
+ return Status::InvalidArgument("log_file not preallocated.");
+ }
+
+ if (!number) {
+ return Status::PathNotFound("log file not available");
+ }
+
+ Status s;
+
+ uint64_t size_bytes;
+ s = env_->GetFileSize(LogFileName(db_options_.wal_dir, number), &size_bytes);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ log_file->reset(new LogFileImpl(number, kAliveLogFile,
+ 0, // SequenceNumber
+ size_bytes));
+
+ return Status::OK();
+}
+
+// The function returns Status::OK() and sets *sequence to 0 if the file
+// exists but is empty.
+Status WalManager::ReadFirstLine(const std::string& fname,
+ const uint64_t number,
+ SequenceNumber* sequence) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+
+ Status* status;
+ bool ignore_error; // true if db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s",
+ (this->ignore_error ? "(ignoring error) " : ""), fname,
+ static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status->ok()) {
+ // only keep the first error
+ *this->status = s;
+ }
+ }
+ };
+
+ std::unique_ptr<FSSequentialFile> file;
+  Status status = fs_->NewSequentialFile(
+      fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  std::unique_ptr<SequentialFileReader> file_reader(
+      new SequentialFileReader(std::move(file), fname));
+
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ reporter.status = &status;
+ reporter.ignore_error = !db_options_.paranoid_checks;
+ log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
+ true /*checksum*/, number);
+ std::string scratch;
+ Slice record;
+
+ if (reader.ReadRecord(&record, &scratch) &&
+ (status.ok() || !db_options_.paranoid_checks)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+      // TODO: read records until the first non-corrupt entry?
+ } else {
+ WriteBatch batch;
+ WriteBatchInternal::SetContents(&batch, record);
+ *sequence = WriteBatchInternal::Sequence(&batch);
+ return Status::OK();
+ }
+ }
+
+  // ReadRecord returns false on EOF, which means the log file is empty. We
+  // return status.ok() in that case and set the sequence number to 0.
+ *sequence = 0;
+ return status;
+}
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_manager.h b/src/rocksdb/db/wal_manager.h
new file mode 100644
index 000000000..783bfe99c
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+#include <memory>
+
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+// WAL manager provides the abstraction for reading the WAL files as a single
+// unit. Internally, it opens and reads the files using the Reader or Writer
+// abstraction.
+class WalManager {
+ public:
+ WalManager(const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, const bool seq_per_batch = false)
+ : db_options_(db_options),
+ file_options_(file_options),
+ env_(db_options.env),
+ fs_(db_options.fs.get()),
+ purge_wal_files_last_run_(0),
+ seq_per_batch_(seq_per_batch),
+ wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {}
+
+ Status GetSortedWalFiles(VectorLogPtr& files);
+
+  // Allow the user to tail the transaction log to find all recent changes to
+  // the database that are newer than `seq_number`.
+ Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options,
+ VersionSet* version_set);
+
+ void PurgeObsoleteWALFiles();
+
+ void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+ Status DeleteFile(const std::string& fname, uint64_t number);
+
+ Status GetLiveWalFile(uint64_t number, std::unique_ptr<LogFile>* log_file);
+
+ Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+ SequenceNumber* sequence) {
+ return ReadFirstRecord(type, number, sequence);
+ }
+
+ Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number,
+ SequenceNumber* sequence) {
+ return ReadFirstLine(fname, number, sequence);
+ }
+
+ private:
+ Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+ WalFileType type);
+  // Requires: all_logs should be sorted with the earliest log file first.
+  // Retains all log files in all_logs which contain updates with sequence
+  // numbers greater than or equal to the requested SequenceNumber.
+ Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+ const SequenceNumber target);
+
+ Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+ SequenceNumber* sequence);
+
+ Status ReadFirstLine(const std::string& fname, const uint64_t number,
+ SequenceNumber* sequence);
+
+ // ------- state from DBImpl ------
+ const ImmutableDBOptions& db_options_;
+ const FileOptions file_options_;
+ Env* env_;
+ FileSystem* fs_;
+
+ // ------- WalManager state -------
+ // cache for ReadFirstRecord() calls
+ std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+ port::Mutex read_first_record_cache_mutex_;
+
+ // last time when PurgeObsoleteWALFiles ran.
+ uint64_t purge_wal_files_last_run_;
+
+ bool seq_per_batch_;
+
+ bool wal_in_db_path_;
+
+  // Obsolete WAL files will be deleted at this interval (in seconds) if TTL
+  // deletion is enabled and the archive size limit is disabled.
+ static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+};
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
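
A minimal usage sketch of the class above (not from the upstream sources; it
assumes an ImmutableDBOptions db_options, a FileOptions file_options and a
VersionSet* version_set are already set up, much like the test fixture in
wal_manager_test.cc below):

  using namespace ROCKSDB_NAMESPACE;

  WalManager wal_manager(db_options, file_options);

  // Enumerate live and archived WAL files (sorted).
  VectorLogPtr wal_files;
  Status s = wal_manager.GetSortedWalFiles(wal_files);

  // Tail every update with a sequence number >= 42.
  std::unique_ptr<TransactionLogIterator> iter;
  if (s.ok()) {
    s = wal_manager.GetUpdatesSince(42 /* seq_number */, &iter,
                                    TransactionLogIterator::ReadOptions(),
                                    version_set);
  }
  for (; s.ok() && iter->Valid(); iter->Next()) {
    BatchResult res = iter->GetBatch();  // res.sequence, res.writeBatchPtr
  }
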
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
new file mode 100644
index 000000000..26bad368e
--- /dev/null
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -0,0 +1,338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_buffer_manager.h"
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+#include "db/wal_manager.h"
+#include "env/mock_env.h"
+#include "file/writable_file_writer.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) mock out VersionSet
+// TODO(icanadi) move other WalManager-specific tests from db_test here
+class WalManagerTest : public testing::Test {
+ public:
+ WalManagerTest()
+ : env_(new MockEnv(Env::Default())),
+ dbname_(test::PerThreadDBPath("wal_manager_test")),
+ db_options_(),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ current_log_number_(0) {
+ DestroyDB(dbname_, Options());
+ }
+
+ void Init() {
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_)));
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ db_options_.wal_dir = dbname_;
+ db_options_.env = env_.get();
+ fs_.reset(new LegacyFileSystemWrapper(env_.get()));
+ db_options_.fs = fs_;
+
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr));
+
+ wal_manager_.reset(new WalManager(db_options_, env_options_));
+ }
+
+ void Reopen() {
+ wal_manager_.reset(new WalManager(db_options_, env_options_));
+ }
+
+ // NOT thread safe
+ void Put(const std::string& key, const std::string& value) {
+ assert(current_log_writer_.get() != nullptr);
+ uint64_t seq = versions_->LastSequence() + 1;
+ WriteBatch batch;
+ batch.Put(key, value);
+ WriteBatchInternal::SetSequence(&batch, seq);
+ current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch));
+ versions_->SetLastAllocatedSequence(seq);
+ versions_->SetLastPublishedSequence(seq);
+ versions_->SetLastSequence(seq);
+ }
+
+ // NOT thread safe
+ void RollTheLog(bool /*archived*/) {
+ current_log_number_++;
+ std::string fname = ArchivedLogFileName(dbname_, current_log_number_);
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), fname, env_options_));
+ current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false));
+ }
+
+ void CreateArchiveLogs(int num_logs, int entries_per_log) {
+ for (int i = 1; i <= num_logs; ++i) {
+ RollTheLog(true);
+ for (int k = 0; k < entries_per_log; ++k) {
+ Put(ToString(k), std::string(1024, 'a'));
+ }
+ }
+ }
+
+ std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+ const SequenceNumber seq) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ Status status = wal_manager_->GetUpdatesSince(
+ seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get());
+ EXPECT_OK(status);
+ return iter;
+ }
+
+ std::unique_ptr<MockEnv> env_;
+ std::string dbname_;
+ ImmutableDBOptions db_options_;
+ WriteController write_controller_;
+ EnvOptions env_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ std::unique_ptr<WalManager> wal_manager_;
+ std::shared_ptr<LegacyFileSystemWrapper> fs_;
+
+ std::unique_ptr<log::Writer> current_log_writer_;
+ uint64_t current_log_number_;
+};
+
+TEST_F(WalManagerTest, ReadFirstRecordCache) {
+ Init();
+ std::string path = dbname_ + "/000001.log";
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions()));
+
+ SequenceNumber s;
+ ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s));
+ ASSERT_EQ(s, 0U);
+
+ ASSERT_OK(
+ wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s));
+ ASSERT_EQ(s, 0U);
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), path, EnvOptions()));
+ log::Writer writer(std::move(file_writer), 1,
+ db_options_.recycle_log_file_num > 0);
+ WriteBatch batch;
+ batch.Put("foo", "bar");
+ WriteBatchInternal::SetSequence(&batch, 10);
+ writer.AddRecord(WriteBatchInternal::Contents(&batch));
+
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here.
+ // Waiting for lei to finish with db_test
+ // env_->count_sequential_reads_ = true;
+ // sequential_read_counter_ sanity test
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
+
+ ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+ ASSERT_EQ(s, 10U);
+ // did a read
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+ ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+ ASSERT_EQ(s, 10U);
+ // no new reads since the value is cached
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, Env* env) {
+ uint64_t dir_size = 0;
+ std::vector<std::string> files;
+ env->GetChildren(dir_path, &files);
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kLogFile) {
+ std::string const file_path = dir_path + "/" + f;
+ uint64_t file_size;
+ env->GetFileSize(file_path, &file_size);
+ dir_size += file_size;
+ }
+ }
+ return dir_size;
+}
+std::vector<std::uint64_t> ListSpecificFiles(
+ Env* env, const std::string& path, const FileType expected_file_type) {
+ std::vector<std::string> files;
+ std::vector<uint64_t> file_numbers;
+ env->GetChildren(path, &files);
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < files.size(); ++i) {
+ if (ParseFileName(files[i], &number, &type)) {
+ if (type == expected_file_type) {
+ file_numbers.push_back(number);
+ }
+ }
+ }
+ return file_numbers;
+}
+
+int CountRecords(TransactionLogIterator* iter) {
+ int count = 0;
+ SequenceNumber lastSequence = 0;
+ BatchResult res;
+ while (iter->Valid()) {
+ res = iter->GetBatch();
+ EXPECT_TRUE(res.sequence > lastSequence);
+ ++count;
+ lastSequence = res.sequence;
+ EXPECT_OK(iter->status());
+ iter->Next();
+ }
+ return count;
+}
+} // namespace
+
+TEST_F(WalManagerTest, WALArchivalSizeLimit) {
+ db_options_.wal_ttl_seconds = 0;
+ db_options_.wal_size_limit_mb = 1000;
+ Init();
+
+ // TEST : Create WalManager with huge size limit and no ttl.
+ // Create some archived files and call PurgeObsoleteWALFiles().
+ // Count the archived log files that survived.
+ // Assert that all of them did.
+ // Change size limit. Re-open WalManager.
+ // Assert that archive is not greater than wal_size_limit_mb after
+ // PurgeObsoleteWALFiles()
+ // Set ttl and time_to_check_ to small values. Re-open db.
+ // Assert that there are no archived logs left.
+
+ std::string archive_dir = ArchivalDirectory(dbname_);
+ CreateArchiveLogs(20, 5000);
+
+ std::vector<std::uint64_t> log_files =
+ ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+ ASSERT_EQ(log_files.size(), 20U);
+
+ db_options_.wal_size_limit_mb = 8;
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ uint64_t archive_size = GetLogDirSize(archive_dir, env_.get());
+ ASSERT_TRUE(archive_size <= db_options_.wal_size_limit_mb * 1024 * 1024);
+
+ db_options_.wal_ttl_seconds = 1;
+ env_->FakeSleepForMicroseconds(2 * 1000 * 1000);
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+ ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, WALArchivalTtl) {
+ db_options_.wal_ttl_seconds = 1000;
+ Init();
+
+  // TEST : Create WalManager with a ttl and no size limit.
+  // Create some archived log files and call PurgeObsoleteWALFiles().
+  // Assert that the files are not deleted.
+  // Reopen the db with a small ttl.
+  // Assert that all archived logs were removed.
+
+ std::string archive_dir = ArchivalDirectory(dbname_);
+ CreateArchiveLogs(20, 5000);
+
+ std::vector<uint64_t> log_files =
+ ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+ ASSERT_GT(log_files.size(), 0U);
+
+ db_options_.wal_ttl_seconds = 1;
+ env_->FakeSleepForMicroseconds(3 * 1000 * 1000);
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+ ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
+ Init();
+ RollTheLog(false);
+ Put("key1", std::string(1024, 'a'));
+ // Create a zero record WAL file.
+ RollTheLog(false);
+ RollTheLog(false);
+
+ Put("key2", std::string(1024, 'a'));
+
+ auto iter = OpenTransactionLogIter(0);
+ ASSERT_EQ(2, CountRecords(iter.get()));
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
+ Init();
+ RollTheLog(false);
+ auto iter = OpenTransactionLogIter(0);
+ // Check that an empty iterator is returned
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
+ Init();
+ CreateArchiveLogs(2, 100);
+ auto iter = OpenTransactionLogIter(0);
+ CreateArchiveLogs(1, 100);
+ int i = 0;
+ for (; iter->Valid(); iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 200);
+ // A new log file was added after the iterator was created.
+ // TryAgain indicates a new iterator is needed to fetch the new data
+ ASSERT_TRUE(iter->status().IsTryAgain());
+
+ iter = OpenTransactionLogIter(0);
+ i = 0;
+ for (; iter->Valid(); iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 300);
+ ASSERT_TRUE(iter->status().ok());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
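
For reference, the archival behaviour exercised above is reached through the
public API via the WAL archival options and DB::GetUpdatesSince(). A rough
sketch, not part of this patch (the path and numbers are made up, error
handling trimmed):

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"
  #include "rocksdb/transaction_log.h"

  rocksdb::Options options;
  options.create_if_missing = true;
  options.WAL_ttl_seconds = 3600;   // keep archived WALs for roughly an hour...
  options.WAL_size_limit_MB = 64;   // ...or until the archive exceeds 64 MB

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/wal_manager_example", &db);

  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  if (s.ok()) {
    s = db->GetUpdatesSince(1 /* seq_number */, &iter);  // served by WalManager
  }
  for (; s.ok() && iter->Valid(); iter->Next()) {
    rocksdb::BatchResult batch = iter->GetBatch();
    // batch.sequence and batch.writeBatchPtr hold the replayed updates.
  }
  delete db;
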
diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc
new file mode 100644
index 000000000..d578db59b
--- /dev/null
+++ b/src/rocksdb/db/write_batch.cc
@@ -0,0 +1,2092 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring
+// kTypeDeletion varstring
+// kTypeSingleDeletion varstring
+// kTypeRangeDeletion varstring varstring
+// kTypeMerge varstring varstring
+// kTypeColumnFamilyValue varint32 varstring varstring
+// kTypeColumnFamilyDeletion varint32 varstring
+// kTypeColumnFamilySingleDeletion varint32 varstring
+// kTypeColumnFamilyRangeDeletion varint32 varstring varstring
+// kTypeColumnFamilyMerge varint32 varstring varstring
+// kTypeBeginPrepareXID varstring
+// kTypeEndPrepareXID
+// kTypeCommitXID varstring
+// kTypeRollbackXID varstring
+// kTypeBeginPersistedPrepareXID varstring
+// kTypeBeginUnprepareXID varstring
+// kTypeNoop
+// varstring :=
+// len: varint32
+// data: uint8[len]
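
A worked example of this encoding (illustrative only): a batch holding a single
Put("foo", "bar") against the default column family, with sequence number 100,
serializes as

  offset 0..7    64 00 00 00 00 00 00 00   sequence = 100 (fixed64, little-endian)
  offset 8..11   01 00 00 00               count = 1 (fixed32)
  offset 12      01                        tag kTypeValue
  offset 13..16  03 66 6f 6f               varstring key   "foo" (len 3)
  offset 17..20  03 62 61 72               varstring value "bar" (len 3)

i.e. WriteBatchInternal::kHeader is the 12-byte sequence+count prefix, and each
record begins with its one-byte tag.
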
+
+#include "rocksdb/write_batch.h"
+
+#include <map>
+#include <stack>
+#include <stdexcept>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_batch_internal.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/merge_operator.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/duplicate_detector.h"
+#include "util/string_util.h"
+#include "util/util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// anon namespace for file-local types
+namespace {
+
+enum ContentFlags : uint32_t {
+ DEFERRED = 1 << 0,
+ HAS_PUT = 1 << 1,
+ HAS_DELETE = 1 << 2,
+ HAS_SINGLE_DELETE = 1 << 3,
+ HAS_MERGE = 1 << 4,
+ HAS_BEGIN_PREPARE = 1 << 5,
+ HAS_END_PREPARE = 1 << 6,
+ HAS_COMMIT = 1 << 7,
+ HAS_ROLLBACK = 1 << 8,
+ HAS_DELETE_RANGE = 1 << 9,
+ HAS_BLOB_INDEX = 1 << 10,
+ HAS_BEGIN_UNPREPARE = 1 << 11,
+};
+
+struct BatchContentClassifier : public WriteBatch::Handler {
+ uint32_t content_flags = 0;
+
+ Status PutCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_PUT;
+ return Status::OK();
+ }
+
+ Status DeleteCF(uint32_t, const Slice&) override {
+ content_flags |= ContentFlags::HAS_DELETE;
+ return Status::OK();
+ }
+
+ Status SingleDeleteCF(uint32_t, const Slice&) override {
+ content_flags |= ContentFlags::HAS_SINGLE_DELETE;
+ return Status::OK();
+ }
+
+ Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_DELETE_RANGE;
+ return Status::OK();
+ }
+
+ Status MergeCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_MERGE;
+ return Status::OK();
+ }
+
+ Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_BLOB_INDEX;
+ return Status::OK();
+ }
+
+ Status MarkBeginPrepare(bool unprepare) override {
+ content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
+ if (unprepare) {
+ content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE;
+ }
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice&) override {
+ content_flags |= ContentFlags::HAS_END_PREPARE;
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ content_flags |= ContentFlags::HAS_COMMIT;
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ content_flags |= ContentFlags::HAS_ROLLBACK;
+ return Status::OK();
+ }
+};
+
+class TimestampAssigner : public WriteBatch::Handler {
+ public:
+ explicit TimestampAssigner(const Slice& ts)
+ : timestamp_(ts), timestamps_(kEmptyTimestampList) {}
+ explicit TimestampAssigner(const std::vector<Slice>& ts_list)
+ : timestamps_(ts_list) {
+ SanityCheck();
+ }
+ ~TimestampAssigner() override {}
+
+ Status PutCF(uint32_t, const Slice& key, const Slice&) override {
+ AssignTimestamp(key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status DeleteCF(uint32_t, const Slice& key) override {
+ AssignTimestamp(key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status SingleDeleteCF(uint32_t, const Slice& key) override {
+ AssignTimestamp(key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status DeleteRangeCF(uint32_t, const Slice& begin_key,
+ const Slice& end_key) override {
+ AssignTimestamp(begin_key);
+ AssignTimestamp(end_key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status MergeCF(uint32_t, const Slice& key, const Slice&) override {
+ AssignTimestamp(key);
+ ++idx_;
+ return Status::OK();
+ }
+
+ Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+ // TODO (yanqin): support blob db in the future.
+ return Status::OK();
+ }
+
+ Status MarkBeginPrepare(bool) override {
+ // TODO (yanqin): support in the future.
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice&) override {
+ // TODO (yanqin): support in the future.
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ // TODO (yanqin): support in the future.
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ // TODO (yanqin): support in the future.
+ return Status::OK();
+ }
+
+ private:
+ void SanityCheck() const {
+ assert(!timestamps_.empty());
+#ifndef NDEBUG
+ const size_t ts_sz = timestamps_[0].size();
+ for (size_t i = 1; i != timestamps_.size(); ++i) {
+ assert(ts_sz == timestamps_[i].size());
+ }
+#endif // !NDEBUG
+ }
+
+ void AssignTimestamp(const Slice& key) {
+ assert(timestamps_.empty() || idx_ < timestamps_.size());
+ const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_];
+ size_t ts_sz = ts.size();
+ char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
+ memcpy(ptr, ts.data(), ts_sz);
+ }
+
+ static const std::vector<Slice> kEmptyTimestampList;
+ const Slice timestamp_;
+ const std::vector<Slice>& timestamps_;
+ size_t idx_ = 0;
+
+ // No copy or move.
+ TimestampAssigner(const TimestampAssigner&) = delete;
+ TimestampAssigner(TimestampAssigner&&) = delete;
+ TimestampAssigner& operator=(const TimestampAssigner&) = delete;
+ TimestampAssigner&& operator=(TimestampAssigner&&) = delete;
+};
+const std::vector<Slice> TimestampAssigner::kEmptyTimestampList;
+
+} // anon namespace
+
+struct SavePoints {
+ std::stack<SavePoint, autovector<SavePoint>> stack;
+};
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
+ : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) {
+ rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+ ? reserved_bytes
+ : WriteBatchInternal::kHeader);
+ rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz)
+ : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) {
+ rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ?
+ reserved_bytes : WriteBatchInternal::kHeader);
+ rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(const std::string& rep)
+ : content_flags_(ContentFlags::DEFERRED),
+ max_bytes_(0),
+ rep_(rep),
+ timestamp_size_(0) {}
+
+WriteBatch::WriteBatch(std::string&& rep)
+ : content_flags_(ContentFlags::DEFERRED),
+ max_bytes_(0),
+ rep_(std::move(rep)),
+ timestamp_size_(0) {}
+
+WriteBatch::WriteBatch(const WriteBatch& src)
+ : wal_term_point_(src.wal_term_point_),
+ content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+ max_bytes_(src.max_bytes_),
+ rep_(src.rep_),
+ timestamp_size_(src.timestamp_size_) {
+ if (src.save_points_ != nullptr) {
+ save_points_.reset(new SavePoints());
+ save_points_->stack = src.save_points_->stack;
+ }
+}
+
+WriteBatch::WriteBatch(WriteBatch&& src) noexcept
+ : save_points_(std::move(src.save_points_)),
+ wal_term_point_(std::move(src.wal_term_point_)),
+ content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+ max_bytes_(src.max_bytes_),
+ rep_(std::move(src.rep_)),
+ timestamp_size_(src.timestamp_size_) {}
+
+WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
+ if (&src != this) {
+ this->~WriteBatch();
+ new (this) WriteBatch(src);
+ }
+ return *this;
+}
+
+WriteBatch& WriteBatch::operator=(WriteBatch&& src) {
+ if (&src != this) {
+ this->~WriteBatch();
+ new (this) WriteBatch(std::move(src));
+ }
+ return *this;
+}
+
+WriteBatch::~WriteBatch() { }
+
+WriteBatch::Handler::~Handler() { }
+
+void WriteBatch::Handler::LogData(const Slice& /*blob*/) {
+ // If the user has not specified something to do with blobs, then we ignore
+ // them.
+}
+
+bool WriteBatch::Handler::Continue() {
+ return true;
+}
+
+void WriteBatch::Clear() {
+ rep_.clear();
+ rep_.resize(WriteBatchInternal::kHeader);
+
+ content_flags_.store(0, std::memory_order_relaxed);
+
+ if (save_points_ != nullptr) {
+ while (!save_points_->stack.empty()) {
+ save_points_->stack.pop();
+ }
+ }
+
+ wal_term_point_.clear();
+}
+
+uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); }
+
+uint32_t WriteBatch::ComputeContentFlags() const {
+ auto rv = content_flags_.load(std::memory_order_relaxed);
+ if ((rv & ContentFlags::DEFERRED) != 0) {
+ BatchContentClassifier classifier;
+ Iterate(&classifier);
+ rv = classifier.content_flags;
+
+ // this method is conceptually const, because it is performing a lazy
+ // computation that doesn't affect the abstract state of the batch.
+ // content_flags_ is marked mutable so that we can perform the
+ // following assignment
+ content_flags_.store(rv, std::memory_order_relaxed);
+ }
+ return rv;
+}
+
+void WriteBatch::MarkWalTerminationPoint() {
+ wal_term_point_.size = GetDataSize();
+ wal_term_point_.count = Count();
+ wal_term_point_.content_flags = content_flags_;
+}
+
+bool WriteBatch::HasPut() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0;
+}
+
+bool WriteBatch::HasDelete() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0;
+}
+
+bool WriteBatch::HasSingleDelete() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0;
+}
+
+bool WriteBatch::HasDeleteRange() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0;
+}
+
+bool WriteBatch::HasMerge() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0;
+}
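
A small illustration of how these predicates interact with the DEFERRED flag
(a sketch using only the public WriteBatch API; not from the upstream file):

  rocksdb::WriteBatch built;
  built.Put("k", "v");                  // content flags updated eagerly
  bool p1 = built.HasPut();             // true, answered from content_flags_

  rocksdb::WriteBatch wrapped(built.Data());  // rep-string ctor => DEFERRED
  bool p2 = wrapped.HasPut();           // first call replays the batch through
                                        // BatchContentClassifier, then caches
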
+
+bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) {
+ assert(input != nullptr && key != nullptr);
+ // Skip tag byte
+ input->remove_prefix(1);
+
+ if (cf_record) {
+ // Skip column_family bytes
+ uint32_t cf;
+ if (!GetVarint32(input, &cf)) {
+ return false;
+ }
+ }
+
+ // Extract key
+ return GetLengthPrefixedSlice(input, key);
+}
+
+bool WriteBatch::HasBeginPrepare() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0;
+}
+
+bool WriteBatch::HasEndPrepare() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0;
+}
+
+bool WriteBatch::HasCommit() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0;
+}
+
+bool WriteBatch::HasRollback() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0;
+}
+
+Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+ uint32_t* column_family, Slice* key,
+ Slice* value, Slice* blob, Slice* xid) {
+ assert(key != nullptr && value != nullptr);
+ *tag = (*input)[0];
+ input->remove_prefix(1);
+ *column_family = 0; // default
+ switch (*tag) {
+ case kTypeColumnFamilyValue:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeValue:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeColumnFamilySingleDeletion:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ if (!GetLengthPrefixedSlice(input, key)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch DeleteRange");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeRangeDeletion:
+ // for range delete, "key" is begin_key, "value" is end_key
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch DeleteRange");
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeMerge:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch BlobIndex");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeBlobIndex:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch BlobIndex");
+ }
+ break;
+ case kTypeLogData:
+ assert(blob != nullptr);
+ if (!GetLengthPrefixedSlice(input, blob)) {
+ return Status::Corruption("bad WriteBatch Blob");
+ }
+ break;
+ case kTypeNoop:
+ case kTypeBeginPrepareXID:
+ // This indicates that the prepared batch is also persisted in the db.
+ // This is used in WritePreparedTxn
+ case kTypeBeginPersistedPrepareXID:
+ // This is used in WriteUnpreparedTxn
+ case kTypeBeginUnprepareXID:
+ break;
+ case kTypeEndPrepareXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad EndPrepare XID");
+ }
+ break;
+ case kTypeCommitXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad Commit XID");
+ }
+ break;
+ case kTypeRollbackXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad Rollback XID");
+ }
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ return Status::OK();
+}
+
+Status WriteBatch::Iterate(Handler* handler) const {
+ if (rep_.size() < WriteBatchInternal::kHeader) {
+ return Status::Corruption("malformed WriteBatch (too small)");
+ }
+
+ return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader,
+ rep_.size());
+}
+
+Status WriteBatchInternal::Iterate(const WriteBatch* wb,
+ WriteBatch::Handler* handler, size_t begin,
+ size_t end) {
+ if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) {
+ return Status::Corruption("Invalid start/end bounds for Iterate");
+ }
+ assert(begin <= end);
+ Slice input(wb->rep_.data() + begin, static_cast<size_t>(end - begin));
+ bool whole_batch =
+ (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size());
+
+ Slice key, value, blob, xid;
+  // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops
+  // from being treated as batch boundary markers, otherwise we would
+  // mis-count the number of batches. We do that by checking whether the
+  // accumulated batch is empty before seeing the next Noop.
+ bool empty_batch = true;
+ uint32_t found = 0;
+ Status s;
+ char tag = 0;
+ uint32_t column_family = 0; // default
+ bool last_was_try_again = false;
+ bool handler_continue = true;
+ while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) {
+ handler_continue = handler->Continue();
+ if (!handler_continue) {
+ break;
+ }
+
+ if (LIKELY(!s.IsTryAgain())) {
+ last_was_try_again = false;
+ tag = 0;
+ column_family = 0; // default
+
+ s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+ &blob, &xid);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ assert(s.IsTryAgain());
+ assert(!last_was_try_again); // to detect infinite loop bugs
+ if (UNLIKELY(last_was_try_again)) {
+ return Status::Corruption(
+ "two consecutive TryAgain in WriteBatch handler; this is either a "
+ "software bug or data corruption.");
+ }
+ last_was_try_again = true;
+ s = Status::OK();
+ }
+
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ case kTypeValue:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_PUT));
+ s = handler->PutCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE));
+ s = handler->DeleteCF(column_family, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilySingleDeletion:
+ case kTypeSingleDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE));
+ s = handler->SingleDeleteCF(column_family, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ case kTypeRangeDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE));
+ s = handler->DeleteRangeCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ case kTypeMerge:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE));
+ s = handler->MergeCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ case kTypeBlobIndex:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
+ s = handler->PutBlobIndexCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ found++;
+ }
+ break;
+ case kTypeLogData:
+ handler->LogData(blob);
+ // A batch might have nothing but LogData. It is still a batch.
+ empty_batch = false;
+ break;
+ case kTypeBeginPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+ handler->MarkBeginPrepare();
+ empty_batch = false;
+ if (!handler->WriteAfterCommit()) {
+ s = Status::NotSupported(
+ "WriteCommitted txn tag when write_after_commit_ is disabled (in "
+ "WritePrepared/WriteUnprepared mode). If it is not due to "
+ "corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ if (handler->WriteBeforePrepare()) {
+ s = Status::NotSupported(
+ "WriteCommitted txn tag when write_before_prepare_ is enabled "
+ "(in WriteUnprepared mode). If it is not due to corruption, the "
+ "WAL must be emptied before changing the WritePolicy.");
+ }
+ break;
+ case kTypeBeginPersistedPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+ handler->MarkBeginPrepare();
+ empty_batch = false;
+ if (handler->WriteAfterCommit()) {
+ s = Status::NotSupported(
+ "WritePrepared/WriteUnprepared txn tag when write_after_commit_ "
+ "is enabled (in default WriteCommitted mode). If it is not due "
+ "to corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ break;
+ case kTypeBeginUnprepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE));
+ handler->MarkBeginPrepare(true /* unprepared */);
+ empty_batch = false;
+ if (handler->WriteAfterCommit()) {
+ s = Status::NotSupported(
+ "WriteUnprepared txn tag when write_after_commit_ is enabled (in "
+ "default WriteCommitted mode). If it is not due to corruption, "
+ "the WAL must be emptied before changing the WritePolicy.");
+ }
+ if (!handler->WriteBeforePrepare()) {
+ s = Status::NotSupported(
+ "WriteUnprepared txn tag when write_before_prepare_ is disabled "
+ "(in WriteCommitted/WritePrepared mode). If it is not due to "
+ "corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ break;
+ case kTypeEndPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE));
+ handler->MarkEndPrepare(xid);
+ empty_batch = true;
+ break;
+ case kTypeCommitXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
+ handler->MarkCommit(xid);
+ empty_batch = true;
+ break;
+ case kTypeRollbackXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK));
+ handler->MarkRollback(xid);
+ empty_batch = true;
+ break;
+ case kTypeNoop:
+ handler->MarkNoop(empty_batch);
+ empty_batch = true;
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ if (handler_continue && whole_batch &&
+ found != WriteBatchInternal::Count(wb)) {
+ return Status::Corruption("WriteBatch has wrong count");
+ } else {
+ return Status::OK();
+ }
+}
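
A minimal sketch of the Handler protocol that Iterate() drives (illustrative,
not from the upstream sources; it uses only the public WriteBatch interface):

  #include "rocksdb/slice.h"
  #include "rocksdb/status.h"
  #include "rocksdb/write_batch.h"

  // Counts the Put and Delete records replayed out of a batch.
  class CountingHandler : public rocksdb::WriteBatch::Handler {
   public:
    rocksdb::Status PutCF(uint32_t /*cf*/, const rocksdb::Slice& /*key*/,
                          const rocksdb::Slice& /*value*/) override {
      ++puts;
      return rocksdb::Status::OK();
    }
    rocksdb::Status DeleteCF(uint32_t /*cf*/,
                             const rocksdb::Slice& /*key*/) override {
      ++deletes;
      return rocksdb::Status::OK();
    }
    int puts = 0;
    int deletes = 0;
  };

  rocksdb::WriteBatch batch;
  batch.Put("k1", "v1");
  batch.Delete("k2");
  CountingHandler handler;
  rocksdb::Status s = batch.Iterate(&handler);
  // Each record is dispatched to exactly one callback, in insertion order,
  // so handler.puts == 1 and handler.deletes == 1 here.
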
+
+bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) {
+ return b->is_latest_persistent_state_;
+}
+
+void WriteBatchInternal::SetAsLastestPersistentState(WriteBatch* b) {
+ b->is_latest_persistent_state_ = true;
+}
+
+uint32_t WriteBatchInternal::Count(const WriteBatch* b) {
+ return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) {
+ EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+ return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+ EncodeFixed64(&b->rep_[0], seq);
+}
+
+size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) {
+ return WriteBatchInternal::kHeader;
+}
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ if (key.size() > size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("key is too large");
+ }
+ if (value.size() > size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("value is too large");
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeValue));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ if (0 == b->timestamp_size_) {
+ PutLengthPrefixedSlice(&b->rep_, key);
+ } else {
+ PutVarint32(&b->rep_,
+ static_cast<uint32_t>(key.size() + b->timestamp_size_));
+ b->rep_.append(key.data(), key.size());
+ b->rep_.append(b->timestamp_size_, '\0');
+ }
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(
+ b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ return WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key,
+ value);
+}
+
+Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
+ const SliceParts& value) {
+ size_t total_key_bytes = 0;
+ for (int i = 0; i < key.num_parts; ++i) {
+ total_key_bytes += key.parts[i].size();
+ }
+ if (total_key_bytes >= size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("key is too large");
+ }
+
+ size_t total_value_bytes = 0;
+ for (int i = 0; i < value.num_parts; ++i) {
+ total_value_bytes += value.parts[i].size();
+ }
+ if (total_value_bytes >= size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("value is too large");
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value) {
+ Status s = CheckSlicePartsLength(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeValue));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ if (0 == b->timestamp_size_) {
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ } else {
+ PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, value);
+ b->content_flags_.store(
+ b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) {
+ return WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key,
+ value);
+}
+
+Status WriteBatchInternal::InsertNoop(WriteBatch* b) {
+ b->rep_.push_back(static_cast<char>(kTypeNoop));
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid,
+ bool write_after_commit,
+ bool unprepared_batch) {
+ // a manually constructed batch can only contain one prepare section
+ assert(b->rep_[12] == static_cast<char>(kTypeNoop));
+
+ // all savepoints up to this point are cleared
+ if (b->save_points_ != nullptr) {
+ while (!b->save_points_->stack.empty()) {
+ b->save_points_->stack.pop();
+ }
+ }
+
+ // rewrite noop as begin marker
+ b->rep_[12] = static_cast<char>(
+ write_after_commit ? kTypeBeginPrepareXID
+ : (unprepared_batch ? kTypeBeginUnprepareXID
+ : kTypeBeginPersistedPrepareXID));
+ b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_END_PREPARE |
+ ContentFlags::HAS_BEGIN_PREPARE,
+ std::memory_order_relaxed);
+ if (unprepared_batch) {
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_BEGIN_UNPREPARE,
+ std::memory_order_relaxed);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) {
+ b->rep_.push_back(static_cast<char>(kTypeCommitXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_COMMIT,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) {
+ b->rep_.push_back(static_cast<char>(kTypeRollbackXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_ROLLBACK,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
+ return WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family),
+ key);
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ return WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family),
+ key);
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+ uint32_t column_family_id,
+ const Slice& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_SINGLE_DELETE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ return WriteBatchInternal::SingleDelete(
+ this, GetColumnFamilyID(column_family), key);
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+ uint32_t column_family_id,
+ const SliceParts& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_SINGLE_DELETE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ return WriteBatchInternal::SingleDelete(
+ this, GetColumnFamilyID(column_family), key);
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key,
+ const Slice& end_key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, begin_key);
+ PutLengthPrefixedSlice(&b->rep_, end_key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE_RANGE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ return WriteBatchInternal::DeleteRange(this, GetColumnFamilyID(column_family),
+ begin_key, end_key);
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, begin_key);
+ PutLengthPrefixedSliceParts(&b->rep_, end_key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE_RANGE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ return WriteBatchInternal::DeleteRange(this, GetColumnFamilyID(column_family),
+ begin_key, end_key);
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ if (key.size() > size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("key is too large");
+ }
+ if (value.size() > size_t{port::kMaxUint32}) {
+ return Status::InvalidArgument("value is too large");
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_MERGE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ return WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key,
+ value);
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key,
+ const SliceParts& value) {
+ Status s = CheckSlicePartsLength(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ PutLengthPrefixedSliceParts(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_MERGE,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ return WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key,
+ value);
+}
+
+Status WriteBatchInternal::PutBlobIndex(WriteBatch* b,
+ uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeBlobIndex));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyBlobIndex));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_BLOB_INDEX,
+ std::memory_order_relaxed);
+ return save.commit();
+}
+
+Status WriteBatch::PutLogData(const Slice& blob) {
+ LocalSavePoint save(this);
+ rep_.push_back(static_cast<char>(kTypeLogData));
+ PutLengthPrefixedSlice(&rep_, blob);
+ return save.commit();
+}
+
+void WriteBatch::SetSavePoint() {
+ if (save_points_ == nullptr) {
+ save_points_.reset(new SavePoints());
+ }
+ // Record length and count of current batch of writes.
+ save_points_->stack.push(SavePoint(
+ GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)));
+}
+
+Status WriteBatch::RollbackToSavePoint() {
+ if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+ return Status::NotFound();
+ }
+
+ // Pop the most recent savepoint off the stack
+ SavePoint savepoint = save_points_->stack.top();
+ save_points_->stack.pop();
+
+ assert(savepoint.size <= rep_.size());
+ assert(static_cast<uint32_t>(savepoint.count) <= Count());
+
+ if (savepoint.size == rep_.size()) {
+ // No changes to rollback
+ } else if (savepoint.size == 0) {
+ // Rollback everything
+ Clear();
+ } else {
+ rep_.resize(savepoint.size);
+ WriteBatchInternal::SetCount(this, savepoint.count);
+ content_flags_.store(savepoint.content_flags, std::memory_order_relaxed);
+ }
+
+ return Status::OK();
+}
+
+Status WriteBatch::PopSavePoint() {
+ if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+ return Status::NotFound();
+ }
+
+ // Pop the most recent savepoint off the stack
+ save_points_->stack.pop();
+
+ return Status::OK();
+}
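
A minimal usage sketch of the savepoint API above (illustrative only; keys and
values are made up):

  rocksdb::WriteBatch batch;
  batch.Put("always", "kept");

  batch.SetSavePoint();
  batch.Put("maybe", "dropped");
  batch.Delete("always");

  // Undo everything appended since the matching SetSavePoint().
  rocksdb::Status s = batch.RollbackToSavePoint();
  // batch now holds only Put("always", "kept"). Calling PopSavePoint()
  // instead would have kept the writes and merely discarded the savepoint.
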
+
+Status WriteBatch::AssignTimestamp(const Slice& ts) {
+ TimestampAssigner ts_assigner(ts);
+ return Iterate(&ts_assigner);
+}
+
+Status WriteBatch::AssignTimestamps(const std::vector<Slice>& ts_list) {
+ TimestampAssigner ts_assigner(ts_list);
+ return Iterate(&ts_assigner);
+}
+
+class MemTableInserter : public WriteBatch::Handler {
+
+ SequenceNumber sequence_;
+ ColumnFamilyMemTables* const cf_mems_;
+ FlushScheduler* const flush_scheduler_;
+ TrimHistoryScheduler* const trim_history_scheduler_;
+ const bool ignore_missing_column_families_;
+ const uint64_t recovering_log_number_;
+ // log number that all Memtables inserted into should reference
+ uint64_t log_number_ref_;
+ DBImpl* db_;
+ const bool concurrent_memtable_writes_;
+ bool post_info_created_;
+
+ bool* has_valid_writes_;
+  // On some (!) platforms just default-constructing a map is too expensive in
+  // the Write() path, as it causes memory allocations even though the map may
+  // go unused. Make its creation optional, but without incurring the extra
+  // allocation of a std::unique_ptr.
+ using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>;
+ using PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type;
+ PostMapType mem_post_info_map_;
+  // The recovered transaction we are currently rebuilding (during recovery)
+ WriteBatch* rebuilding_trx_;
+ SequenceNumber rebuilding_trx_seq_;
+  // Increase the seq number once per write batch. Otherwise increase it once
+  // per key.
+ bool seq_per_batch_;
+ // Whether the memtable write will be done only after the commit
+ bool write_after_commit_;
+ // Whether memtable write can be done before prepare
+ bool write_before_prepare_;
+ // Whether this batch was unprepared or not
+ bool unprepared_batch_;
+ using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
+ DupDetector duplicate_detector_;
+ bool dup_dectector_on_;
+
+ bool hint_per_batch_;
+ bool hint_created_;
+ // Hints for this batch
+ using HintMap = std::unordered_map<MemTable*, void*>;
+ using HintMapType = std::aligned_storage<sizeof(HintMap)>::type;
+ HintMapType hint_;
+
+ HintMap& GetHintMap() {
+ assert(hint_per_batch_);
+ if (!hint_created_) {
+ new (&hint_) HintMap();
+ hint_created_ = true;
+ }
+ return *reinterpret_cast<HintMap*>(&hint_);
+ }
+
+ MemPostInfoMap& GetPostMap() {
+ assert(concurrent_memtable_writes_);
+ if(!post_info_created_) {
+ new (&mem_post_info_map_) MemPostInfoMap();
+ post_info_created_ = true;
+ }
+ return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
+ }
+
+ bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) {
+ assert(!write_after_commit_);
+ assert(rebuilding_trx_ != nullptr);
+ if (!dup_dectector_on_) {
+ new (&duplicate_detector_) DuplicateDetector(db_);
+ dup_dectector_on_ = true;
+ }
+ return reinterpret_cast<DuplicateDetector*>
+ (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_);
+ }
+
+ protected:
+ bool WriteBeforePrepare() const override { return write_before_prepare_; }
+ bool WriteAfterCommit() const override { return write_after_commit_; }
+
+ public:
+ // cf_mems should not be shared with concurrent inserters
+ MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families,
+ uint64_t recovering_log_number, DB* db,
+ bool concurrent_memtable_writes,
+ bool* has_valid_writes = nullptr, bool seq_per_batch = false,
+ bool batch_per_txn = true, bool hint_per_batch = false)
+ : sequence_(_sequence),
+ cf_mems_(cf_mems),
+ flush_scheduler_(flush_scheduler),
+ trim_history_scheduler_(trim_history_scheduler),
+ ignore_missing_column_families_(ignore_missing_column_families),
+ recovering_log_number_(recovering_log_number),
+ log_number_ref_(0),
+ db_(static_cast_with_check<DBImpl, DB>(db)),
+ concurrent_memtable_writes_(concurrent_memtable_writes),
+ post_info_created_(false),
+ has_valid_writes_(has_valid_writes),
+ rebuilding_trx_(nullptr),
+ rebuilding_trx_seq_(0),
+ seq_per_batch_(seq_per_batch),
+ // Write after commit currently uses one seq per key (instead of per
+ // batch). So seq_per_batch being false indicates write_after_commit
+ // approach.
+ write_after_commit_(!seq_per_batch),
+ // WriteUnprepared can write WriteBatches per transaction, so
+ // batch_per_txn being false indicates write_before_prepare.
+ write_before_prepare_(!batch_per_txn),
+ unprepared_batch_(false),
+ duplicate_detector_(),
+ dup_dectector_on_(false),
+ hint_per_batch_(hint_per_batch),
+ hint_created_(false) {
+ assert(cf_mems_);
+ }
+
+ ~MemTableInserter() override {
+ if (dup_dectector_on_) {
+ reinterpret_cast<DuplicateDetector*>
+ (&duplicate_detector_)->~DuplicateDetector();
+ }
+ if (post_info_created_) {
+ reinterpret_cast<MemPostInfoMap*>
+ (&mem_post_info_map_)->~MemPostInfoMap();
+ }
+ if (hint_created_) {
+ for (auto iter : GetHintMap()) {
+ delete[] reinterpret_cast<char*>(iter.second);
+ }
+ reinterpret_cast<HintMap*>(&hint_)->~HintMap();
+ }
+ delete rebuilding_trx_;
+ }
+
+ MemTableInserter(const MemTableInserter&) = delete;
+ MemTableInserter& operator=(const MemTableInserter&) = delete;
+
+ // The batch seq is regularly restarted; In normal mode it is set when
+ // MemTableInserter is constructed in the write thread and in recovery mode it
+ // is set when a batch, which is tagged with seq, is read from the WAL.
+ // Within a sequenced batch, which could be a merge of multiple batches, we
+ // have two policies to advance the seq: i) seq_per_key (default) and ii)
+ // seq_per_batch. To implement the latter we need to mark the boundary between
+ // the individual batches. The approach is this: 1) Use the terminating
+ // markers to indicate the boundary (kTypeEndPrepareXID, kTypeCommitXID,
+ // kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absence of a
+ // natural boundary marker.
+ void MaybeAdvanceSeq(bool batch_boundry = false) {
+ if (batch_boundry == seq_per_batch_) {
+ sequence_++;
+ }
+ }
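
A worked illustration of the two policies (a sketch based on the comment
above): suppose one WAL entry carries sub-batches B1 = {Put a, Put b} and
B2 = {Put c}, starting at sequence 100.

  seq_per_key   (seq_per_batch_ == false): a -> 100, b -> 101, c -> 102
  seq_per_batch (seq_per_batch_ == true) : a and b both get 100, c gets 101;
                                           MaybeAdvanceSeq(true) fires only at
                                           the boundary markers between them.
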
+
+ void set_log_number_ref(uint64_t log) { log_number_ref_ = log; }
+
+ SequenceNumber sequence() const { return sequence_; }
+
+ void PostProcess() {
+ assert(concurrent_memtable_writes_);
+ // If post info was not created, there is nothing to process and no need
+ // to create it on demand.
+ if (post_info_created_) {
+ for (auto& pair : GetPostMap()) {
+ pair.first->BatchPostProcess(pair.second);
+ }
+ }
+ }
+
+ bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+ // If we are in concurrent mode, it is the caller's responsibility
+ // to clone the original ColumnFamilyMemTables so that each thread
+ // has its own instance. Otherwise, it must be guaranteed that there
+ // is no concurrent access.
+ bool found = cf_mems_->Seek(column_family_id);
+ if (!found) {
+ if (ignore_missing_column_families_) {
+ *s = Status::OK();
+ } else {
+ *s = Status::InvalidArgument(
+ "Invalid column family specified in write batch");
+ }
+ return false;
+ }
+ if (recovering_log_number_ != 0 &&
+ recovering_log_number_ < cf_mems_->GetLogNumber()) {
+ // This is true only in the recovery environment (recovering_log_number_
+ // is always 0 in the non-recovery, regular write code-path):
+ // * If recovering_log_number_ < cf_mems_->GetLogNumber(), the column
+ // family already contains updates from this log. We can't apply updates
+ // twice because of update-in-place or merge workloads -- ignore the
+ // update.
+ *s = Status::OK();
+ return false;
+ }
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+
+ if (log_number_ref_ > 0) {
+ cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_);
+ }
+
+ return true;
+ }
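
The recovery filter in SeekToColumnFamily boils down to a comparison of log numbers; a hedged standalone sketch of that rule (function and parameter names are illustrative, not RocksDB API):

    #include <cstdint>
    #include <iostream>

    // Returns true when a recovered WAL update should be applied to a column family.
    bool ShouldApplyRecoveredUpdate(uint64_t recovering_log_number,
                                    uint64_t cf_log_number) {
      if (recovering_log_number == 0) {
        return true;  // regular (non-recovery) write path: always apply
      }
      // Updates from logs older than the column family's log number were
      // already flushed; re-applying them could corrupt update-in-place or
      // merge workloads.
      return recovering_log_number >= cf_log_number;
    }

    int main() {
      std::cout << ShouldApplyRecoveredUpdate(5, 7) << "\n";  // 0: skip, already flushed
      std::cout << ShouldApplyRecoveredUpdate(9, 7) << "\n";  // 1: apply
      return 0;
    }
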
+
+ Status PutCFImpl(uint32_t column_family_id, const Slice& key,
+ const Slice& value, ValueType value_type) {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+ Status ret_status;
+
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ // inplace_update_support is inconsistent with snapshots, and therefore with
+ // any kind of transactions including the ones that use seq_per_batch
+ assert(!seq_per_batch_ || !moptions->inplace_update_support);
+ if (!moptions->inplace_update_support) {
+ bool mem_res =
+ mem->Add(sequence_, value_type, key, value,
+ concurrent_memtable_writes_, get_post_process_info(mem),
+ hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ } else if (moptions->inplace_callback == nullptr) {
+ assert(!concurrent_memtable_writes_);
+ mem->Update(sequence_, key, value);
+ } else {
+ assert(!concurrent_memtable_writes_);
+ if (mem->UpdateCallback(sequence_, key, value)) {
+ } else {
+ // key not found in memtable. Do sst get, update, add
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions ropts;
+ // it's going to be overwritten for sure, so there is no point caching the
+ // data block containing the old version
+ ropts.fill_cache = false;
+ ropts.snapshot = &read_from_snapshot;
+
+ std::string prev_value;
+ std::string merged_value;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ Status s = Status::NotSupported();
+ if (db_ != nullptr && recovering_log_number_ == 0) {
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ s = db_->Get(ropts, cf_handle, key, &prev_value);
+ }
+
+ char* prev_buffer = const_cast<char*>(prev_value.c_str());
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+ auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
+ s.ok() ? &prev_size : nullptr,
+ value, &merged_value);
+ if (status == UpdateStatus::UPDATED_INPLACE) {
+ // prev_value is updated in-place with final value.
+ bool mem_res __attribute__((__unused__));
+ mem_res = mem->Add(
+ sequence_, value_type, key, Slice(prev_buffer, prev_size));
+ assert(mem_res);
+ RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+ } else if (status == UpdateStatus::UPDATED) {
+ // merged_value contains the final value.
+ bool mem_res __attribute__((__unused__));
+ mem_res =
+ mem->Add(sequence_, value_type, key, Slice(merged_value));
+ assert(mem_res);
+ RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+ }
+ }
+ }
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
+ }
+ // Since all Puts are logged in the transaction logs (if enabled), always
+ // bump the sequence number, even if the update eventually fails and does
+ // not result in a memtable add/update.
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ return ret_status;
+ }
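
For the inplace_callback branch above, the callback receives the existing value buffer (or nullptr), its size, the new value, and an output string, and reports UPDATED_INPLACE or UPDATED. Below is a hedged sketch of such a callback, consistent with the call site above but not the one RocksDB ships; OverwriteIfFits is an illustrative name, and the exact UpdateStatus declaration should be checked in the options headers. It would be installed via ColumnFamilyOptions::inplace_callback together with inplace_update_support = true.

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include "rocksdb/options.h"  // UpdateStatus
    #include "rocksdb/slice.h"

    using ROCKSDB_NAMESPACE::Slice;
    using ROCKSDB_NAMESPACE::UpdateStatus;

    UpdateStatus OverwriteIfFits(char* existing_value,
                                 uint32_t* existing_value_size,
                                 Slice delta_value, std::string* merged_value) {
      if (existing_value != nullptr && existing_value_size != nullptr &&
          delta_value.size() <= *existing_value_size) {
        // The key exists and the new value fits: update the buffer in place.
        std::memcpy(existing_value, delta_value.data(), delta_value.size());
        *existing_value_size = static_cast<uint32_t>(delta_value.size());
        return UpdateStatus::UPDATED_INPLACE;
      }
      // Key missing or buffer too small: return the final value so the caller
      // adds a fresh memtable entry (the UPDATED path above).
      merged_value->assign(delta_value.data(), delta_value.size());
      return UpdateStatus::UPDATED;
    }
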
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ return PutCFImpl(column_family_id, key, value, kTypeValue);
+ }
+
+ Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value, ValueType delete_type) {
+ Status ret_status;
+ MemTable* mem = cf_mems_->GetMemTable();
+ bool mem_res =
+ mem->Add(sequence_, delete_type, key, value,
+ concurrent_memtable_writes_, get_post_process_info(mem),
+ hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ return ret_status;
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+
+ auto ret_status = DeleteImpl(column_family_id, key, Slice(), kTypeDeletion);
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ }
+ return ret_status;
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id,
+ key);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+
+ auto ret_status =
+ DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion);
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key);
+ }
+ return ret_status;
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ // TODO(myabandeh): when transactional DeleteRange support is added,
+ // check if end_key must also be added.
+ batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+ if (db_ != nullptr) {
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cf_handle)->cfd();
+ if (!cfd->is_delete_range_supported()) {
+ return Status::NotSupported(
+ std::string("DeleteRange not supported for table type ") +
+ cfd->ioptions()->table_factory->Name() + " in CF " +
+ cfd->GetName());
+ }
+ }
+
+ auto ret_status =
+ DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion);
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ }
+ return ret_status;
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value);
+ return Status::OK();
+ // else insert into the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF has probably been flushed, so there is no need to insert, but we
+ // still need to keep track of the keys for the upcoming rollback/commit.
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
+ value);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+
+ Status ret_status;
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ bool perform_merge = false;
+ assert(!concurrent_memtable_writes_ ||
+ moptions->max_successive_merges == 0);
+
+ // If we pass the DB through and options.max_successive_merges is hit
+ // during recovery, a Get() will be issued, which will try to acquire the
+ // DB mutex and deadlock because that mutex is already held. So we disable
+ // merge during recovery.
+ if (moptions->max_successive_merges > 0 && db_ != nullptr &&
+ recovering_log_number_ == 0) {
+ assert(!concurrent_memtable_writes_);
+ LookupKey lkey(key, sequence_);
+
+ // Count the number of successive merges at the head
+ // of the key in the memtable
+ size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
+
+ if (num_merges >= moptions->max_successive_merges) {
+ perform_merge = true;
+ }
+ }
+
+ if (perform_merge) {
+ // 1) Get the existing value
+ std::string get_value;
+
+ // Pass in the sequence number so that we also include previous merge
+ // operations in the same batch.
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions read_options;
+ read_options.snapshot = &read_from_snapshot;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ db_->Get(read_options, cf_handle, key, &get_value);
+ Slice get_value_slice = Slice(get_value);
+
+ // 2) Apply this merge
+ auto merge_operator = moptions->merge_operator;
+ assert(merge_operator);
+
+ std::string new_value;
+
+ Status merge_status = MergeHelper::TimedFullMerge(
+ merge_operator, key, &get_value_slice, {value}, &new_value,
+ moptions->info_log, moptions->statistics, Env::Default());
+
+ if (!merge_status.ok()) {
+ // Failed to merge!
+ // Store the delta in memtable
+ perform_merge = false;
+ } else {
+ // 3) Add value to memtable
+ assert(!concurrent_memtable_writes_);
+ bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value);
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ }
+ }
+
+ if (!perform_merge) {
+ // Add merge operator to memtable
+ bool mem_res =
+ mem->Add(sequence_, kTypeMerge, key, value,
+ concurrent_memtable_writes_, get_post_process_info(mem));
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ }
+
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If ret_status is TryAgain, let the next attempt add the key to the
+ // rebuilding transaction object.
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value);
+ }
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ return ret_status;
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ // Same as PutCF except for value type.
+ return PutCFImpl(column_family_id, key, value, kTypeBlobIndex);
+ }
+
+ void CheckMemtableFull() {
+ if (flush_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+ assert(cfd != nullptr);
+ if (cfd->mem()->ShouldScheduleFlush() &&
+ cfd->mem()->MarkFlushScheduled()) {
+ // MarkFlushScheduled only returns true if we are the one that
+ // should take action, so no need to dedup further
+ flush_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ // check if memtable_list size exceeds max_write_buffer_size_to_maintain
+ if (trim_history_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+
+ assert(cfd);
+ assert(cfd->ioptions());
+
+ const size_t size_to_maintain = static_cast<size_t>(
+ cfd->ioptions()->max_write_buffer_size_to_maintain);
+
+ if (size_to_maintain > 0) {
+ MemTableList* const imm = cfd->imm();
+ assert(imm);
+
+ if (imm->HasHistory()) {
+ const MemTable* const mem = cfd->mem();
+ assert(mem);
+
+ if (mem->ApproximateMemoryUsageFast() +
+ imm->ApproximateMemoryUsageExcludingLast() >=
+ size_to_maintain &&
+ imm->MarkTrimHistoryNeeded()) {
+ trim_history_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ }
+ }
+ }
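
A standalone sketch of the trim-history trigger above (illustrative names, not RocksDB API): trimming is scheduled once the active memtable plus the retained immutable history reach max_write_buffer_size_to_maintain.

    #include <cstddef>
    #include <iostream>

    bool ShouldScheduleTrimHistory(size_t active_mem_bytes,
                                   size_t imm_history_bytes_excluding_last,
                                   size_t max_write_buffer_size_to_maintain) {
      if (max_write_buffer_size_to_maintain == 0) {
        return false;  // history trimming disabled
      }
      return active_mem_bytes + imm_history_bytes_excluding_last >=
             max_write_buffer_size_to_maintain;
    }

    int main() {
      // 48MB active + 80MB retained history against a 128MB budget -> trim.
      std::cout << ShouldScheduleTrimHistory(48u << 20, 80u << 20, 128u << 20)
                << "\n";  // 1
      return 0;
    }
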
+
+ // The write batch handler calls MarkBeginPrepare with unprepare set to true
+ // if it encounters the kTypeBeginUnprepareXID marker.
+ Status MarkBeginPrepare(bool unprepare) override {
+ assert(rebuilding_trx_ == nullptr);
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ // during recovery we rebuild a hollow transaction
+ // from all encountered prepare sections of the wal
+ if (db_->allow_2pc() == false) {
+ return Status::NotSupported(
+ "WAL contains prepared transactions. Open with "
+ "TransactionDB::Open().");
+ }
+
+ // we are now iterating through a prepared section
+ rebuilding_trx_ = new WriteBatch();
+ rebuilding_trx_seq_ = sequence_;
+ // Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers.
+ // unprepared_batch_ should be false because it is false by default, and
+ // gets reset to false in MarkEndPrepare.
+ assert(!unprepared_batch_);
+ unprepared_batch_ = unprepare;
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice& name) override {
+ assert(db_);
+ assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0));
+
+ if (recovering_log_number_ != 0) {
+ assert(db_->allow_2pc());
+ size_t batch_cnt =
+ write_after_commit_
+ ? 0 // 0 will disable further checks
+ : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
+ db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
+ rebuilding_trx_, rebuilding_trx_seq_,
+ batch_cnt, unprepared_batch_);
+ unprepared_batch_ = false;
+ rebuilding_trx_ = nullptr;
+ } else {
+ assert(rebuilding_trx_ == nullptr);
+ }
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return Status::OK();
+ }
+
+ Status MarkNoop(bool empty_batch) override {
+ // A hack in pessimistic transactions could result in a noop at the start
+ // of the write batch; that noop should be ignored.
+ if (!empty_batch) {
+ // In the absence of Prepare markers, a kTypeNoop tag indicates the end of
+ // a batch. This happens when a write batch commits while skipping the
+ // prepare phase.
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+ }
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice& name) override {
+ assert(db_);
+
+ Status s;
+
+ if (recovering_log_number_ != 0) {
+ // in recovery when we encounter a commit marker
+ // we lookup this transaction in our set of rebuilt transactions
+ // and commit.
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+ // the log containing the prepared section may have
+ // been released in the last incarnation because the
+ // data was flushed to L0
+ if (trx != nullptr) {
+ // at this point individual CF lognumbers will prevent
+ // duplicate re-insertion of values.
+ assert(log_number_ref_ == 0);
+ if (write_after_commit_) {
+ // write_after_commit_ can only have one batch in trx.
+ assert(trx->batches_.size() == 1);
+ const auto& batch_info = trx->batches_.begin()->second;
+ // all inserts must reference this trx log number
+ log_number_ref_ = batch_info.log_number_;
+ s = batch_info.batch_->Iterate(this);
+ log_number_ref_ = 0;
+ }
+ // else the values are already inserted before the commit
+
+ if (s.ok()) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+ } else {
+ // When writes are not delayed until commit, there is no disconnect between
+ // a memtable write and the WAL that supports it, so the commit does not
+ // need to reference any log as the one it depends on.
+ assert(!write_after_commit_ || log_number_ref_ > 0);
+ }
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return s;
+ }
+
+ Status MarkRollback(const Slice& name) override {
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+ // the log containing the transaction's prep section
+ // may have been released in the previous incarnation
+ // because we knew it had been rolled back
+ if (trx != nullptr) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ } else {
+ // in non-recovery mode we simply ignore this tag
+ }
+
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return Status::OK();
+ }
+
+ private:
+ MemTablePostProcessInfo* get_post_process_info(MemTable* mem) {
+ if (!concurrent_memtable_writes_) {
+ // No need to batch counters locally if we don't use concurrent mode.
+ return nullptr;
+ }
+ return &GetPostMap()[mem];
+ }
+};
+
+// This function can only be called under these conditions:
+// 1) During Recovery()
+// 2) During Write(), in a single-threaded write thread
+// 3) During Write(), in a concurrent context where the memtables have been
+//    cloned
+// The reason is that it calls memtables->Seek(), which has a stateful cache.
+Status WriteBatchInternal::InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(
+ sequence, memtables, flush_scheduler, trim_history_scheduler,
+ ignore_missing_column_families, recovery_log_number, db,
+ concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch,
+ batch_per_txn);
+ for (auto w : write_group) {
+ if (w->CallbackFailed()) {
+ continue;
+ }
+ w->sequence = inserter.sequence();
+ if (!w->ShouldWriteToMemtable()) {
+ // In seq_per_batch_ mode this advances the seq by one.
+ inserter.MaybeAdvanceSeq(true);
+ continue;
+ }
+ SetSequence(w->batch, inserter.sequence());
+ inserter.set_log_number_ref(w->log_ref);
+ w->status = w->batch->Iterate(&inserter);
+ if (!w->status.ok()) {
+ return w->status;
+ }
+ assert(!seq_per_batch || w->batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::InsertInto(
+ WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt,
+ bool batch_per_txn, bool hint_per_batch) {
+#ifdef NDEBUG
+ (void)batch_cnt;
+#endif
+ assert(writer->ShouldWriteToMemtable());
+ MemTableInserter inserter(
+ sequence, memtables, flush_scheduler, trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch,
+ batch_per_txn, hint_per_batch);
+ SetSequence(writer->batch, sequence);
+ inserter.set_log_number_ref(writer->log_ref);
+ Status s = writer->batch->Iterate(&inserter);
+ assert(!seq_per_batch || batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt);
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+Status WriteBatchInternal::InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, SequenceNumber* next_seq,
+ bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler,
+ trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, has_valid_writes,
+ seq_per_batch, batch_per_txn);
+ Status s = batch->Iterate(&inserter);
+ if (next_seq != nullptr) {
+ *next_seq = inserter.sequence();
+ }
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+ assert(contents.size() >= WriteBatchInternal::kHeader);
+ b->rep_.assign(contents.data(), contents.size());
+ b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
+ const bool wal_only) {
+ size_t src_len;
+ int src_count;
+ uint32_t src_flags;
+
+ const SavePoint& batch_end = src->GetWalTerminationPoint();
+
+ if (wal_only && !batch_end.is_cleared()) {
+ src_len = batch_end.size - WriteBatchInternal::kHeader;
+ src_count = batch_end.count;
+ src_flags = batch_end.content_flags;
+ } else {
+ src_len = src->rep_.size() - WriteBatchInternal::kHeader;
+ src_count = Count(src);
+ src_flags = src->content_flags_.load(std::memory_order_relaxed);
+ }
+
+ SetCount(dst, Count(dst) + src_count);
+ assert(src->rep_.size() >= WriteBatchInternal::kHeader);
+ dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
+ dst->content_flags_.store(
+ dst->content_flags_.load(std::memory_order_relaxed) | src_flags,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
+ size_t rightByteSize) {
+ if (leftByteSize == 0 || rightByteSize == 0) {
+ return leftByteSize + rightByteSize;
+ } else {
+ return leftByteSize + rightByteSize - WriteBatchInternal::kHeader;
+ }
+}
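
The arithmetic above reflects that Append() copies only the record area of src, so two non-empty batches end up sharing a single 12-byte header; a small worked sketch:

    #include <cstddef>
    #include <iostream>

    static const size_t kHeader = 12;  // 8-byte sequence + 4-byte count

    size_t AppendedByteSizeSketch(size_t left, size_t right) {
      if (left == 0 || right == 0) {
        return left + right;            // nothing appended, or appending into empty
      }
      return left + right - kHeader;    // the second header is dropped
    }

    int main() {
      std::cout << AppendedByteSizeSketch(40, 30) << "\n";  // 58
      std::cout << AppendedByteSizeSketch(0, 30) << "\n";   // 30
      return 0;
    }
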
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
new file mode 100644
index 000000000..e4c0e74bd
--- /dev/null
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/write_batch_base.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple implementations of the SliceParts variants of Put(), Delete(),
+// SingleDelete(), DeleteRange() and Merge(). Child classes can override
+// these methods with more performant solutions if they choose.
+Status WriteBatchBase::Put(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Put(column_family, key_slice, value_slice);
+}
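
A small usage sketch of the SliceParts form (not part of this file; the key and value contents are made up): the fragments are gathered into contiguous buffers and forwarded to the single-Slice Put().

    #include "rocksdb/slice.h"
    #include "rocksdb/status.h"
    #include "rocksdb/write_batch.h"

    using namespace ROCKSDB_NAMESPACE;

    Status GatheredPut(WriteBatch* batch) {
      Slice key_parts[2] = {Slice("user:"), Slice("42")};
      Slice value_parts[3] = {Slice("name="), Slice("alice"), Slice(";")};
      // Equivalent to batch->Put("user:42", "name=alice;") without building
      // the concatenated strings at the call site.
      return batch->Put(SliceParts(key_parts, 2), SliceParts(value_parts, 3));
    }
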
+
+Status WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Put(key_slice, value_slice);
+}
+
+Status WriteBatchBase::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return Delete(column_family, key_slice);
+}
+
+Status WriteBatchBase::Delete(const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return Delete(key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return SingleDelete(column_family, key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return SingleDelete(key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ std::string begin_key_buf, end_key_buf;
+ Slice begin_key_slice(begin_key, &begin_key_buf);
+ Slice end_key_slice(end_key, &end_key_buf);
+ return DeleteRange(column_family, begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ std::string begin_key_buf, end_key_buf;
+ Slice begin_key_slice(begin_key, &begin_key_buf);
+ Slice end_key_slice(end_key, &end_key_buf);
+ return DeleteRange(begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::Merge(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Merge(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Merge(key_slice, value_slice);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h
new file mode 100644
index 000000000..30c489965
--- /dev/null
+++ b/src/rocksdb/db/write_batch_internal.h
@@ -0,0 +1,250 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+#include "db/flush_scheduler.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_thread.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+class FlushScheduler;
+class ColumnFamilyData;
+
+class ColumnFamilyMemTables {
+ public:
+ virtual ~ColumnFamilyMemTables() {}
+ virtual bool Seek(uint32_t column_family_id) = 0;
+ // Returns the log number of the current column family; during recovery,
+ // updates from logs older than this number have already been applied and
+ // should be ignored.
+ virtual uint64_t GetLogNumber() const = 0;
+ virtual MemTable* GetMemTable() const = 0;
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+ virtual ColumnFamilyData* current() { return nullptr; }
+};
+
+class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesDefault(MemTable* mem)
+ : ok_(false), mem_(mem) {}
+
+ bool Seek(uint32_t column_family_id) override {
+ ok_ = (column_family_id == 0);
+ return ok_;
+ }
+
+ uint64_t GetLogNumber() const override { return 0; }
+
+ MemTable* GetMemTable() const override {
+ assert(ok_);
+ return mem_;
+ }
+
+ ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
+
+ private:
+ bool ok_;
+ MemTable* mem_;
+};
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+
+ // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+ static const size_t kHeader = 12;
+
+ // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key, const Slice& end_key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
+ const bool write_after_commit = true,
+ const bool unprepared_batch = false);
+
+ static Status MarkRollback(WriteBatch* batch, const Slice& xid);
+
+ static Status MarkCommit(WriteBatch* batch, const Slice& xid);
+
+ static Status InsertNoop(WriteBatch* batch);
+
+ // Return the number of entries in the batch.
+ static uint32_t Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, uint32_t n);
+
+ // Return the sequence number for the start of this batch.
+ static SequenceNumber Sequence(const WriteBatch* batch);
+
+ // Store the specified number as the sequence number for the start of
+ // this batch.
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+ // Returns the offset of the first entry in the batch.
+ // This offset is only valid if the batch is not empty.
+ static size_t GetFirstOffset(WriteBatch* batch);
+
+ static Slice Contents(const WriteBatch* batch) {
+ return Slice(batch->rep_);
+ }
+
+ static size_t ByteSize(const WriteBatch* batch) {
+ return batch->rep_.size();
+ }
+
+ static Status SetContents(WriteBatch* batch, const Slice& contents);
+
+ static Status CheckSlicePartsLength(const SliceParts& key,
+ const SliceParts& value);
+
+ // Inserts each writer's batch in write_group into the memtable(s).
+ //
+ // If ignore_missing_column_families == true, a WriteBatch
+ // referencing a non-existing column family will be ignored.
+ // If ignore_missing_column_families == false, processing of the
+ // batches will be stopped if a reference to a non-existing
+ // column family is found and InvalidArgument() will be returned.
+ // The writes in the batches may be only partially applied at that point.
+ //
+ // If log_number is non-zero, the memtable will be updated only if
+ // memtables->GetLogNumber() >= log_number.
+ //
+ // If flush_scheduler is non-null, it will be invoked if the memtable
+ // should be flushed.
+ //
+ // Under concurrent use, the caller is responsible for making sure that
+ // the memtables object itself is thread-local.
+ static Status InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false, uint64_t log_number = 0,
+ DB* db = nullptr, bool concurrent_memtable_writes = false,
+ bool seq_per_batch = false, bool batch_per_txn = true);
+
+ // Convenience form of InsertInto when you have only one batch
+ // next_seq returns the seq after last sequence number used in MemTable insert
+ static Status InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false, uint64_t log_number = 0,
+ DB* db = nullptr, bool concurrent_memtable_writes = false,
+ SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
+ bool seq_per_batch = false, bool batch_per_txn = true);
+
+ static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false,
+ uint64_t log_number = 0, DB* db = nullptr,
+ bool concurrent_memtable_writes = false,
+ bool seq_per_batch = false, size_t batch_cnt = 0,
+ bool batch_per_txn = true,
+ bool hint_per_batch = false);
+
+ static Status Append(WriteBatch* dst, const WriteBatch* src,
+ const bool WAL_only = false);
+
+ // Returns the byte size of appending a WriteBatch with ByteSize
+ // leftByteSize and a WriteBatch with ByteSize rightByteSize
+ static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
+
+ // Iterate over [begin, end) range of a write batch
+ static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
+ size_t begin, size_t end);
+
+ // This write batch includes the latest state that should be persisted. Such
+ // state is meant to be used only during recovery.
+ static void SetAsLastestPersistentState(WriteBatch* b);
+ static bool IsLatestPersistentState(const WriteBatch* b);
+};
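
A short sketch tying the accessors above to the 12-byte header (a test-style illustration, not part of this header): the sequence and count set through WriteBatchInternal are exactly what Sequence() and Count() read back out of the first kHeader bytes of the serialized representation.

    #include <cassert>
    #include "db/write_batch_internal.h"
    #include "rocksdb/write_batch.h"

    using namespace ROCKSDB_NAMESPACE;

    void HeaderRoundTrip() {
      WriteBatch batch;
      batch.Put("k1", "v1");
      batch.Put("k2", "v2");
      WriteBatchInternal::SetSequence(&batch, 4242);

      assert(WriteBatchInternal::Sequence(&batch) == 4242);
      assert(WriteBatchInternal::Count(&batch) == 2);
      // The serialized size always includes the kHeader-byte prefix.
      assert(WriteBatchInternal::ByteSize(&batch) >= WriteBatchInternal::kHeader);
    }
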
+
+// LocalSavePoint is similar to a scope guard
+class LocalSavePoint {
+ public:
+ explicit LocalSavePoint(WriteBatch* batch)
+ : batch_(batch),
+ savepoint_(batch->GetDataSize(), batch->Count(),
+ batch->content_flags_.load(std::memory_order_relaxed))
+#ifndef NDEBUG
+ ,
+ committed_(false)
+#endif
+ {
+ }
+
+#ifndef NDEBUG
+ ~LocalSavePoint() { assert(committed_); }
+#endif
+ Status commit() {
+#ifndef NDEBUG
+ committed_ = true;
+#endif
+ if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
+ batch_->rep_.resize(savepoint_.size);
+ WriteBatchInternal::SetCount(batch_, savepoint_.count);
+ batch_->content_flags_.store(savepoint_.content_flags,
+ std::memory_order_relaxed);
+ return Status::MemoryLimit();
+ }
+ return Status::OK();
+ }
+
+ private:
+ WriteBatch* batch_;
+ SavePoint savepoint_;
+#ifndef NDEBUG
+ bool committed_;
+#endif
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
new file mode 100644
index 000000000..84f9a45ec
--- /dev/null
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -0,0 +1,888 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <memory>
+#include "db/column_family.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string PrintContents(WriteBatch* b) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ Options options;
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+ std::string state;
+ ColumnFamilyMemTablesDefault cf_mems_default(mem);
+ Status s =
+ WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr);
+ uint32_t count = 0;
+ int put_count = 0;
+ int delete_count = 0;
+ int single_delete_count = 0;
+ int delete_range_count = 0;
+ int merge_count = 0;
+ for (int i = 0; i < 2; ++i) {
+ Arena arena;
+ ScopedArenaIterator arena_iter_guard;
+ std::unique_ptr<InternalIterator> iter_guard;
+ InternalIterator* iter;
+ if (i == 0) {
+ iter = mem->NewIterator(ReadOptions(), &arena);
+ arena_iter_guard.set(iter);
+ } else {
+ iter = mem->NewRangeTombstoneIterator(ReadOptions(),
+ kMaxSequenceNumber /* read_seq */);
+ iter_guard.reset(iter);
+ }
+ if (iter == nullptr) {
+ continue;
+ }
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey));
+ switch (ikey.type) {
+ case kTypeValue:
+ state.append("Put(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ put_count++;
+ break;
+ case kTypeDeletion:
+ state.append("Delete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ delete_count++;
+ break;
+ case kTypeSingleDeletion:
+ state.append("SingleDelete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ single_delete_count++;
+ break;
+ case kTypeRangeDeletion:
+ state.append("DeleteRange(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ delete_range_count++;
+ break;
+ case kTypeMerge:
+ state.append("Merge(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ merge_count++;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ state.append("@");
+ state.append(NumberToString(ikey.sequence));
+ }
+ }
+ EXPECT_EQ(b->HasPut(), put_count > 0);
+ EXPECT_EQ(b->HasDelete(), delete_count > 0);
+ EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0);
+ EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0);
+ EXPECT_EQ(b->HasMerge(), merge_count > 0);
+ if (!s.ok()) {
+ state.append(s.ToString());
+ } else if (count != WriteBatchInternal::Count(b)) {
+ state.append("CountMismatch()");
+ }
+ delete mem->Unref();
+ return state;
+}
+
+class WriteBatchTest : public testing::Test {};
+
+TEST_F(WriteBatchTest, Empty) {
+ WriteBatch batch;
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(0u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Multiple) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+ batch.Delete(Slice("box"));
+ batch.DeleteRange(Slice("bar"), Slice("foo"));
+ batch.Put(Slice("baz"), Slice("boo"));
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
+ ASSERT_EQ(4u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(
+ "Put(baz, boo)@103"
+ "Delete(box)@101"
+ "Put(foo, bar)@100"
+ "DeleteRange(bar, foo)@102",
+ PrintContents(&batch));
+ ASSERT_EQ(4u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Corruption) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+ batch.Delete(Slice("box"));
+ WriteBatchInternal::SetSequence(&batch, 200);
+ Slice contents = WriteBatchInternal::Contents(&batch);
+ WriteBatchInternal::SetContents(&batch,
+ Slice(contents.data(),contents.size()-1));
+ ASSERT_EQ("Put(foo, bar)@200"
+ "Corruption: bad WriteBatch Delete",
+ PrintContents(&batch));
+}
+
+TEST_F(WriteBatchTest, Append) {
+ WriteBatch b1, b2;
+ WriteBatchInternal::SetSequence(&b1, 200);
+ WriteBatchInternal::SetSequence(&b2, 300);
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("",
+ PrintContents(&b1));
+ ASSERT_EQ(0u, b1.Count());
+ b2.Put("a", "va");
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("Put(a, va)@200",
+ PrintContents(&b1));
+ ASSERT_EQ(1u, b1.Count());
+ b2.Clear();
+ b2.Put("b", "vb");
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("Put(a, va)@200"
+ "Put(b, vb)@201",
+ PrintContents(&b1));
+ ASSERT_EQ(2u, b1.Count());
+ b2.Delete("foo");
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+ ASSERT_EQ(4u, b1.Count());
+ b2.Clear();
+ b2.Put("c", "cc");
+ b2.Put("d", "dd");
+ b2.MarkWalTerminationPoint();
+ b2.Put("e", "ee");
+ WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true);
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Put(c, cc)@204"
+ "Put(d, dd)@205"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+ ASSERT_EQ(6u, b1.Count());
+ ASSERT_EQ(
+ "Put(c, cc)@0"
+ "Put(d, dd)@1"
+ "Put(e, ee)@2",
+ PrintContents(&b2));
+ ASSERT_EQ(3u, b2.Count());
+}
+
+TEST_F(WriteBatchTest, SingleDeletion) {
+ WriteBatch batch;
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, batch.Count());
+ batch.Put("a", "va");
+ ASSERT_EQ("Put(a, va)@100", PrintContents(&batch));
+ ASSERT_EQ(1u, batch.Count());
+ batch.SingleDelete("a");
+ ASSERT_EQ(
+ "SingleDelete(a)@101"
+ "Put(a, va)@100",
+ PrintContents(&batch));
+ ASSERT_EQ(2u, batch.Count());
+}
+
+namespace {
+ struct TestHandler : public WriteBatch::Handler {
+ std::string seen;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id == 0) {
+ seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
+ } else {
+ seen += "PutCF(" + ToString(column_family_id) + ", " +
+ key.ToString() + ", " + value.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ if (column_family_id == 0) {
+ seen += "Delete(" + key.ToString() + ")";
+ } else {
+ seen += "DeleteCF(" + ToString(column_family_id) + ", " +
+ key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t column_family_id,
+ const Slice& key) override {
+ if (column_family_id == 0) {
+ seen += "SingleDelete(" + key.ToString() + ")";
+ } else {
+ seen += "SingleDeleteCF(" + ToString(column_family_id) + ", " +
+ key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ if (column_family_id == 0) {
+ seen += "DeleteRange(" + begin_key.ToString() + ", " +
+ end_key.ToString() + ")";
+ } else {
+ seen += "DeleteRangeCF(" + ToString(column_family_id) + ", " +
+ begin_key.ToString() + ", " + end_key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id == 0) {
+ seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
+ } else {
+ seen += "MergeCF(" + ToString(column_family_id) + ", " +
+ key.ToString() + ", " + value.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ void LogData(const Slice& blob) override {
+ seen += "LogData(" + blob.ToString() + ")";
+ }
+ Status MarkBeginPrepare(bool unprepare) override {
+ seen +=
+ "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")";
+ return Status::OK();
+ }
+ Status MarkEndPrepare(const Slice& xid) override {
+ seen += "MarkEndPrepare(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ Status MarkNoop(bool empty_batch) override {
+ seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")";
+ return Status::OK();
+ }
+ Status MarkCommit(const Slice& xid) override {
+ seen += "MarkCommit(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ Status MarkRollback(const Slice& xid) override {
+ seen += "MarkRollback(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ };
+}
+
+TEST_F(WriteBatchTest, PutNotImplemented) {
+ WriteBatch batch;
+ batch.Put(Slice("k1"), Slice("v1"));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, DeleteNotImplemented) {
+ WriteBatch batch;
+ batch.Delete(Slice("k2"));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Delete(k2)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, SingleDeleteNotImplemented) {
+ WriteBatch batch;
+ batch.SingleDelete(Slice("k2"));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeNotImplemented) {
+ WriteBatch batch;
+ batch.Merge(Slice("foo"), Slice("bar"));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, Blob) {
+ WriteBatch batch;
+ batch.Put(Slice("k1"), Slice("v1"));
+ batch.Put(Slice("k2"), Slice("v2"));
+ batch.Put(Slice("k3"), Slice("v3"));
+ batch.PutLogData(Slice("blob1"));
+ batch.Delete(Slice("k2"));
+ batch.SingleDelete(Slice("k3"));
+ batch.PutLogData(Slice("blob2"));
+ batch.Merge(Slice("foo"), Slice("bar"));
+ ASSERT_EQ(6u, batch.Count());
+ ASSERT_EQ(
+ "Merge(foo, bar)@5"
+ "Put(k1, v1)@0"
+ "Delete(k2)@3"
+ "Put(k2, v2)@1"
+ "SingleDelete(k3)@4"
+ "Put(k3, v3)@2",
+ PrintContents(&batch));
+
+ TestHandler handler;
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "Put(k3, v3)"
+ "LogData(blob1)"
+ "Delete(k2)"
+ "SingleDelete(k3)"
+ "LogData(blob2)"
+ "Merge(foo, bar)",
+ handler.seen);
+}
+
+TEST_F(WriteBatchTest, PrepareCommit) {
+ WriteBatch batch;
+ WriteBatchInternal::InsertNoop(&batch);
+ batch.Put(Slice("k1"), Slice("v1"));
+ batch.Put(Slice("k2"), Slice("v2"));
+ batch.SetSavePoint();
+ WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"));
+ Status s = batch.RollbackToSavePoint();
+ ASSERT_EQ(s, Status::NotFound());
+ WriteBatchInternal::MarkCommit(&batch, Slice("xid1"));
+ WriteBatchInternal::MarkRollback(&batch, Slice("xid1"));
+ ASSERT_EQ(2u, batch.Count());
+
+ TestHandler handler;
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "MarkBeginPrepare(false)"
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "MarkEndPrepare(xid1)"
+ "MarkCommit(xid1)"
+ "MarkRollback(xid1)",
+ handler.seen);
+}
+
+// Running this test requires more than 30GB of memory, with a single memory
+// allocation of more than 30GB. Not all platforms can run it, and it also
+// takes a long time, so it is disabled.
+TEST_F(WriteBatchTest, DISABLED_ManyUpdates) {
+ // Insert roughly three billion tiny (4-byte) key/value pairs to push the
+ // total batch size past 30GB.
+ static const size_t kKeyValueSize = 4u;
+ static const uint32_t kNumUpdates = uint32_t(3 << 30);
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u);
+ char c = 'A';
+ for (uint32_t i = 0; i < kNumUpdates; i++) {
+ if (c > 'Z') {
+ c = 'A';
+ }
+ raw[0] = c;
+ raw[raw.length() - 1] = c;
+ c++;
+ batch.Put(raw, raw);
+ }
+
+ ASSERT_EQ(kNumUpdates, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ uint32_t num_seen = 0;
+ char expected_char = 'A';
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ(expected_char, key[0]);
+ EXPECT_EQ(expected_char, value[0]);
+ EXPECT_EQ(expected_char, key[kKeyValueSize - 1]);
+ EXPECT_EQ(expected_char, value[kKeyValueSize - 1]);
+ expected_char++;
+ if (expected_char > 'Z') {
+ expected_char = 'A';
+ }
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < kNumUpdates; }
+ } handler;
+
+ batch.Iterate(&handler);
+ ASSERT_EQ(kNumUpdates, handler.num_seen);
+}
+
+// This test requires more than 18GB of memory to run, with a single memory
+// allocation of more than 12GB. Not all platforms can run it, so it is
+// disabled.
+TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) {
+ // Insert key and value of 3GB and push total batch size to 12GB.
+ static const size_t kKeyValueSize = 3221225472u;
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(size_t(12884901888ull + 1024u));
+ for (char i = 0; i < 2; i++) {
+ raw[0] = 'A' + i;
+ raw[raw.length() - 1] = 'A' - i;
+ batch.Put(raw, raw);
+ }
+
+ ASSERT_EQ(2u, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ int num_seen = 0;
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ('A' + num_seen, key[0]);
+ EXPECT_EQ('A' + num_seen, value[0]);
+ EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]);
+ EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]);
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < 2; }
+ } handler;
+
+ batch.Iterate(&handler);
+ ASSERT_EQ(2, handler.num_seen);
+}
+
+TEST_F(WriteBatchTest, Continue) {
+ WriteBatch batch;
+
+ struct Handler : public TestHandler {
+ int num_seen = 0;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ ++num_seen;
+ return TestHandler::PutCF(column_family_id, key, value);
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ ++num_seen;
+ return TestHandler::DeleteCF(column_family_id, key);
+ }
+ Status SingleDeleteCF(uint32_t column_family_id,
+ const Slice& key) override {
+ ++num_seen;
+ return TestHandler::SingleDeleteCF(column_family_id, key);
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ ++num_seen;
+ return TestHandler::MergeCF(column_family_id, key, value);
+ }
+ void LogData(const Slice& blob) override {
+ ++num_seen;
+ TestHandler::LogData(blob);
+ }
+ bool Continue() override { return num_seen < 5; }
+ } handler;
+
+ batch.Put(Slice("k1"), Slice("v1"));
+ batch.Put(Slice("k2"), Slice("v2"));
+ batch.PutLogData(Slice("blob1"));
+ batch.Delete(Slice("k1"));
+ batch.SingleDelete(Slice("k2"));
+ batch.PutLogData(Slice("blob2"));
+ batch.Merge(Slice("foo"), Slice("bar"));
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "LogData(blob1)"
+ "Delete(k1)"
+ "SingleDelete(k2)",
+ handler.seen);
+}
+
+TEST_F(WriteBatchTest, PutGatherSlices) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+
+ {
+ // Try a write where the key is one slice but the value is two
+ Slice key_slice("baz");
+ Slice value_slices[2] = { Slice("header"), Slice("payload") };
+ batch.Put(SliceParts(&key_slice, 1),
+ SliceParts(value_slices, 2));
+ }
+
+ {
+ // One where the key is composite but the value is a single slice
+ Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") };
+ Slice value_slice("value");
+ batch.Put(SliceParts(key_slices, 3),
+ SliceParts(&value_slice, 1));
+ }
+
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ("Put(baz, headerpayload)@101"
+ "Put(foo, bar)@100"
+ "Put(keypart2part3, value)@102",
+ PrintContents(&batch));
+ ASSERT_EQ(3u, batch.Count());
+}
+
+namespace {
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+ explicit ColumnFamilyHandleImplDummy(int id)
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
+ uint32_t GetID() const override { return id_; }
+ const Comparator* GetComparator() const override {
+ return BytewiseComparator();
+ }
+
+ private:
+ uint32_t id_;
+};
+} // namespace anonymous
+
+TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
+ WriteBatch batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+ batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+ batch.Delete(&eight, Slice("eightfoo"));
+ batch.SingleDelete(&two, Slice("twofoo"));
+ batch.DeleteRange(&two, Slice("3foo"), Slice("4foo"));
+ batch.Merge(&three, Slice("threethree"), Slice("3three"));
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Merge(Slice("omom"), Slice("nom"));
+
+ TestHandler handler;
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "DeleteRangeCF(2, 3foo, 4foo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
+ WriteBatchWithIndex batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+ batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+ batch.Delete(&eight, Slice("eightfoo"));
+ batch.SingleDelete(&two, Slice("twofoo"));
+ batch.Merge(&three, Slice("threethree"), Slice("3three"));
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Merge(Slice("omom"), Slice("nom"));
+
+ std::unique_ptr<WBWIIterator> iter;
+
+ iter.reset(batch.NewIterator(&eight));
+ iter->Seek("eightfoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar8", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&two));
+ iter->Seek("twofoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar2", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator());
+ iter->Seek("gggg");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&zero));
+ iter->Seek("foo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("foo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("foo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ TestHandler handler;
+ batch.GetWriteBatch()->Iterate(&handler);
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(WriteBatchTest, SavePointTest) {
+ Status s;
+ WriteBatch batch;
+ batch.SetSavePoint();
+
+ batch.Put("A", "a");
+ batch.Put("B", "b");
+ batch.SetSavePoint();
+
+ batch.Put("C", "c");
+ batch.Delete("A");
+ batch.SetSavePoint();
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@3"
+ "Put(A, a)@0"
+ "Put(B, b)@1"
+ "Put(C, c)@2",
+ PrintContents(&batch));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Put(A, a)@0"
+ "Put(B, b)@1",
+ PrintContents(&batch));
+
+ batch.Delete("A");
+ batch.Put("B", "bb");
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ("", PrintContents(&batch));
+
+ s = batch.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch));
+
+ batch.Put("D", "d");
+ batch.Delete("A");
+
+ batch.SetSavePoint();
+
+ batch.Put("A", "aaa");
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ batch.SetSavePoint();
+
+ batch.Put("D", "d");
+ batch.Delete("A");
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ s = batch.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ WriteBatch batch2;
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ batch2.Delete("A");
+ batch2.SetSavePoint();
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(A)@0", PrintContents(&batch2));
+
+ batch2.Clear();
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ batch2.SetSavePoint();
+
+ batch2.Delete("B");
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+ batch2.SetSavePoint();
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ WriteBatch batch3;
+
+ s = batch3.PopSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch3));
+
+ batch3.SetSavePoint();
+ batch3.Delete("A");
+
+ s = batch3.PopSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(A)@0", PrintContents(&batch3));
+}
+
+TEST_F(WriteBatchTest, MemoryLimitTest) {
+ Status s;
+  // The header size is 12 bytes. The two Puts take 8 bytes each, which gives
+  // a total of 12 + 8 * 2 = 28 bytes.
+ WriteBatch batch(0, 28);
+
+ ASSERT_OK(batch.Put("a", "...."));
+ ASSERT_OK(batch.Put("b", "...."));
+ s = batch.Put("c", "....");
+ ASSERT_TRUE(s.IsMemoryLimit());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_callback.h b/src/rocksdb/db/write_callback.h
new file mode 100644
index 000000000..106d02041
--- /dev/null
+++ b/src/rocksdb/db/write_callback.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+class WriteCallback {
+ public:
+ virtual ~WriteCallback() {}
+
+ // Will be called while on the write thread before the write executes. If
+ // this function returns a non-OK status, the write will be aborted and this
+ // status will be returned to the caller of DB::Write().
+ virtual Status Callback(DB* db) = 0;
+
+  // Returns true if writes with this callback can be batched with other
+  // writes.
+ virtual bool AllowWriteBatching() = 0;
+};
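+
+// Illustrative usage sketch (the class name and conflict_detected_ member are
+// hypothetical; compare the subclasses exercised in write_callback_test.cc):
+//
+//   class ConflictCheckCallback : public WriteCallback {
+//    public:
+//     Status Callback(DB* /*db*/) override {
+//       return conflict_detected_ ? Status::Busy() : Status::OK();
+//     }
+//     bool AllowWriteBatching() override { return true; }
+//     bool conflict_detected_ = false;
+//   };
+//
+// A non-OK status returned from Callback() is propagated to the caller of
+// DB::Write() (or DBImpl::WriteWithCallback()) and the batch is not applied.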
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_callback_test.cc b/src/rocksdb/db/write_callback_test.cc
new file mode 100644
index 000000000..df7d673aa
--- /dev/null
+++ b/src/rocksdb/db/write_callback_test.cc
@@ -0,0 +1,452 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/write_callback.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteCallbackTest : public testing::Test {
+ public:
+ string dbname;
+
+ WriteCallbackTest() {
+ dbname = test::PerThreadDBPath("write_callback_testdb");
+ }
+};
+
+class WriteCallbackTestWriteCallback1 : public WriteCallback {
+ public:
+ bool was_called = false;
+
+ Status Callback(DB *db) override {
+ was_called = true;
+
+ // Make sure db is a DBImpl
+ DBImpl* db_impl = dynamic_cast<DBImpl*> (db);
+ if (db_impl == nullptr) {
+ return Status::InvalidArgument("");
+ }
+
+ return Status::OK();
+ }
+
+ bool AllowWriteBatching() override { return true; }
+};
+
+class WriteCallbackTestWriteCallback2 : public WriteCallback {
+ public:
+ Status Callback(DB* /*db*/) override { return Status::Busy(); }
+ bool AllowWriteBatching() override { return true; }
+};
+
+class MockWriteCallback : public WriteCallback {
+ public:
+ bool should_fail_ = false;
+ bool allow_batching_ = false;
+ std::atomic<bool> was_called_{false};
+
+ MockWriteCallback() {}
+
+ MockWriteCallback(const MockWriteCallback& other) {
+ should_fail_ = other.should_fail_;
+ allow_batching_ = other.allow_batching_;
+ was_called_.store(other.was_called_.load());
+ }
+
+ Status Callback(DB* /*db*/) override {
+ was_called_.store(true);
+ if (should_fail_) {
+ return Status::Busy();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ bool AllowWriteBatching() override { return allow_batching_; }
+};
+
+TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
+ struct WriteOP {
+ WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; }
+
+ void Put(const string& key, const string& val) {
+ kvs_.push_back(std::make_pair(key, val));
+ write_batch_.Put(key, val);
+ }
+
+ void Clear() {
+ kvs_.clear();
+ write_batch_.Clear();
+ callback_.was_called_.store(false);
+ }
+
+ MockWriteCallback callback_;
+ WriteBatch write_batch_;
+ std::vector<std::pair<string, string>> kvs_;
+ };
+
+  // In each scenario we'll launch multiple threads to write.
+  // The size of each inner vector equals the number of threads, and each
+  // boolean in it denotes whether the callback of the corresponding thread
+  // should fail (true) or succeed (false). For example, {true, false} means
+  // the first thread's callback fails while the second thread's succeeds.
+ std::vector<std::vector<WriteOP>> write_scenarios = {
+ {true},
+ {false},
+ {false, false},
+ {true, true},
+ {true, false},
+ {false, true},
+ {false, false, false},
+ {true, true, true},
+ {false, true, false},
+ {true, false, true},
+ {true, false, false, false, false},
+ {false, false, false, false, true},
+ {false, false, true, false, true},
+ };
+
+ for (auto& unordered_write : {true, false}) {
+ for (auto& seq_per_batch : {true, false}) {
+ for (auto& two_queues : {true, false}) {
+ for (auto& allow_parallel : {true, false}) {
+ for (auto& allow_batching : {true, false}) {
+ for (auto& enable_WAL : {true, false}) {
+ for (auto& enable_pipelined_write : {true, false}) {
+ for (auto& write_group : write_scenarios) {
+ Options options;
+ options.create_if_missing = true;
+ options.unordered_write = unordered_write;
+ options.allow_concurrent_memtable_write = allow_parallel;
+ options.enable_pipelined_write = enable_pipelined_write;
+ options.two_write_queues = two_queues;
+ // Skip unsupported combinations
+ if (options.enable_pipelined_write && seq_per_batch) {
+ continue;
+ }
+ if (options.enable_pipelined_write && options.two_write_queues) {
+ continue;
+ }
+ if (options.unordered_write &&
+ !options.allow_concurrent_memtable_write) {
+ continue;
+ }
+ if (options.unordered_write && options.enable_pipelined_write) {
+ continue;
+ }
+
+ ReadOptions read_options;
+ DB* db;
+ DBImpl* db_impl;
+
+ DestroyDB(dbname, options);
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ auto open_s =
+ DBImpl::Open(db_options, dbname, column_families, &handles,
+ &db, seq_per_batch, true /* batch_per_txn */);
+ ASSERT_OK(open_s);
+ assert(handles.size() == 1);
+ delete handles[0];
+
+ db_impl = dynamic_cast<DBImpl*>(db);
+ ASSERT_TRUE(db_impl);
+
+ // Writers that have called JoinBatchGroup.
+ std::atomic<uint64_t> threads_joining(0);
+ // Writers that have linked to the queue
+ std::atomic<uint64_t> threads_linked(0);
+ // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point.
+ std::atomic<uint64_t> threads_verified(0);
+
+ std::atomic<uint64_t> seq(db_impl->GetLatestSequenceNumber());
+ ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", [&](void*) {
+ uint64_t cur_threads_joining = threads_joining.fetch_add(1);
+ // Wait for the last joined writer to link to the queue.
+ // In this way the writers link to the queue one by one.
+ // This allows us to confidently detect the first writer
+ // who increases threads_linked as the leader.
+ while (threads_linked.load() < cur_threads_joining) {
+ }
+ });
+
+ // Verification once writers call JoinBatchGroup.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ uint64_t cur_threads_linked = threads_linked.fetch_add(1);
+ bool is_leader = false;
+ bool is_last = false;
+
+ // who am i
+ is_leader = (cur_threads_linked == 0);
+ is_last = (cur_threads_linked == write_group.size() - 1);
+
+ // check my state
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (is_leader) {
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_GROUP_LEADER);
+ } else {
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_INIT);
+ }
+
+ // (meta test) the first WriteOP should indeed be the first
+ // and the last should be the last (all others can be out of
+ // order)
+ if (is_leader) {
+ ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+ !write_group.front().callback_.should_fail_);
+ } else if (is_last) {
+ ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+ !write_group.back().callback_.should_fail_);
+ }
+
+ threads_verified.fetch_add(1);
+ // Wait here until all verification in this sync-point
+ // callback finish for all writers.
+ while (threads_verified.load() < write_group.size()) {
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) {
+ // check my state
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (!allow_batching) {
+ // no batching so everyone should be a leader
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_GROUP_LEADER);
+ } else if (!allow_parallel) {
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_COMPLETED ||
+ (enable_pipelined_write &&
+ writer->state ==
+ WriteThread::State::
+ STATE_MEMTABLE_WRITER_LEADER));
+ }
+ });
+
+ std::atomic<uint32_t> thread_num(0);
+ std::atomic<char> dummy_key(0);
+
+ // Each write thread create a random write batch and write to DB
+ // with a write callback.
+ std::function<void()> write_with_callback_func = [&]() {
+ uint32_t i = thread_num.fetch_add(1);
+ Random rnd(i);
+
+ // leaders gotta lead
+ while (i > 0 && threads_verified.load() < 1) {
+ }
+
+ // loser has to lose
+ while (i == write_group.size() - 1 &&
+ threads_verified.load() < write_group.size() - 1) {
+ }
+
+ auto& write_op = write_group.at(i);
+ write_op.Clear();
+ write_op.callback_.allow_batching_ = allow_batching;
+
+ // insert some keys
+ for (uint32_t j = 0; j < rnd.Next() % 50; j++) {
+ // grab unique key
+ char my_key = dummy_key.fetch_add(1);
+
+ string skey(5, my_key);
+ string sval(10, my_key);
+ write_op.Put(skey, sval);
+
+ if (!write_op.callback_.should_fail_ && !seq_per_batch) {
+ seq.fetch_add(1);
+ }
+ }
+ if (!write_op.callback_.should_fail_ && seq_per_batch) {
+ seq.fetch_add(1);
+ }
+
+ WriteOptions woptions;
+ woptions.disableWAL = !enable_WAL;
+ woptions.sync = enable_WAL;
+ Status s;
+ if (seq_per_batch) {
+ class PublishSeqCallback : public PreReleaseCallback {
+ public:
+ PublishSeqCallback(DBImpl* db_impl_in)
+ : db_impl_(db_impl_in) {}
+ Status Callback(SequenceNumber last_seq, bool /*not used*/,
+ uint64_t, size_t /*index*/,
+ size_t /*total*/) override {
+ db_impl_->SetLastPublishedSequence(last_seq);
+ return Status::OK();
+ }
+ DBImpl* db_impl_;
+ } publish_seq_callback(db_impl);
+ // seq_per_batch requires a natural batch separator or Noop
+ WriteBatchInternal::InsertNoop(&write_op.write_batch_);
+ const size_t ONE_BATCH = 1;
+ s = db_impl->WriteImpl(
+ woptions, &write_op.write_batch_, &write_op.callback_,
+ nullptr, 0, false, nullptr, ONE_BATCH,
+ two_queues ? &publish_seq_callback : nullptr);
+ } else {
+ s = db_impl->WriteWithCallback(
+ woptions, &write_op.write_batch_, &write_op.callback_);
+ }
+
+ if (write_op.callback_.should_fail_) {
+ ASSERT_TRUE(s.IsBusy());
+ } else {
+ ASSERT_OK(s);
+ }
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // do all the writes
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < write_group.size(); i++) {
+ threads.emplace_back(write_with_callback_func);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // check for keys
+ string value;
+ for (auto& w : write_group) {
+ ASSERT_TRUE(w.callback_.was_called_.load());
+ for (auto& kvp : w.kvs_) {
+ if (w.callback_.should_fail_) {
+ ASSERT_TRUE(
+ db->Get(read_options, kvp.first, &value).IsNotFound());
+ } else {
+ ASSERT_OK(db->Get(read_options, kvp.first, &value));
+ ASSERT_EQ(value, kvp.second);
+ }
+ }
+ }
+
+ ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence());
+
+ delete db;
+ DestroyDB(dbname, options);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_F(WriteCallbackTest, WriteCallBackTest) {
+ Options options;
+ WriteOptions write_options;
+ ReadOptions read_options;
+ string value;
+ DB* db;
+ DBImpl* db_impl;
+
+ DestroyDB(dbname, options);
+
+ options.create_if_missing = true;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+
+ db_impl = dynamic_cast<DBImpl*> (db);
+ ASSERT_TRUE(db_impl);
+
+ WriteBatch wb;
+
+ wb.Put("a", "value.a");
+ wb.Delete("x");
+
+ // Test a simple Write
+ s = db->Write(write_options, &wb);
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a", value);
+
+ // Test WriteWithCallback
+ WriteCallbackTestWriteCallback1 callback1;
+ WriteBatch wb2;
+
+ wb2.Put("a", "value.a2");
+
+ s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
+ ASSERT_OK(s);
+ ASSERT_TRUE(callback1.was_called);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a2", value);
+
+ // Test WriteWithCallback for a callback that fails
+ WriteCallbackTestWriteCallback2 callback2;
+ WriteBatch wb3;
+
+ wb3.Put("a", "value.a3");
+
+ s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
+ ASSERT_NOK(s);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a2", value);
+
+ delete db;
+ DestroyDB(dbname, options);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
new file mode 100644
index 000000000..5480aabd1
--- /dev/null
+++ b/src/rocksdb/db/write_controller.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_controller.h"
+
+#include <atomic>
+#include <cassert>
+#include <ratio>
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+ ++total_stopped_;
+ return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+ uint64_t write_rate) {
+ total_delayed_++;
+ // Reset counters.
+ last_refill_time_ = 0;
+ bytes_left_ = 0;
+ set_delayed_write_rate(write_rate);
+ return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken>
+WriteController::GetCompactionPressureToken() {
+ ++total_compaction_pressure_;
+ return std::unique_ptr<WriteControllerToken>(
+ new CompactionPressureToken(this));
+}
+
+bool WriteController::IsStopped() const {
+ return total_stopped_.load(std::memory_order_relaxed) > 0;
+}
+// This is called while holding the DB mutex, so we can't sleep and need to
+// minimize how often we read the clock.
+// If it turns out to be a performance issue, we can redesign the thread
+// synchronization model here.
+// The function trusts the caller to sleep for the number of microseconds
+// returned.
+uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) {
+ if (total_stopped_.load(std::memory_order_relaxed) > 0) {
+ return 0;
+ }
+ if (total_delayed_.load(std::memory_order_relaxed) == 0) {
+ return 0;
+ }
+
+ const uint64_t kMicrosPerSecond = 1000000;
+ const uint64_t kRefillInterval = 1024U;
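+  // For example, at a delayed write rate of 10,000,000 bytes/sec (as used in
+  // write_controller_test.cc), one 1024-microsecond refill interval grants
+  // 10,000,000 * 1024 / 1,000,000 = 10,240 bytes of write budget.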
+
+ if (bytes_left_ >= num_bytes) {
+ bytes_left_ -= num_bytes;
+ return 0;
+ }
+  // The clock is read inside the DB mutex less than once per refill
+  // interval.
+ auto time_now = NowMicrosMonotonic(env);
+
+ uint64_t sleep_debt = 0;
+ uint64_t time_since_last_refill = 0;
+ if (last_refill_time_ != 0) {
+ if (last_refill_time_ > time_now) {
+ sleep_debt = last_refill_time_ - time_now;
+ } else {
+ time_since_last_refill = time_now - last_refill_time_;
+ bytes_left_ +=
+ static_cast<uint64_t>(static_cast<double>(time_since_last_refill) /
+ kMicrosPerSecond * delayed_write_rate_);
+ if (time_since_last_refill >= kRefillInterval &&
+ bytes_left_ > num_bytes) {
+        // If a refill interval has already passed and we have enough bytes,
+        // return without extra sleeping.
+ last_refill_time_ = time_now;
+ bytes_left_ -= num_bytes;
+ return 0;
+ }
+ }
+ }
+
+ uint64_t single_refill_amount =
+ delayed_write_rate_ * kRefillInterval / kMicrosPerSecond;
+ if (bytes_left_ + single_refill_amount >= num_bytes) {
+    // Wait for one refill interval.
+    // Never schedule the refill for less than one full interval, so that we
+    // do not have to read the clock again sooner than necessary.
+ bytes_left_ = bytes_left_ + single_refill_amount - num_bytes;
+ last_refill_time_ = time_now + kRefillInterval;
+ return kRefillInterval + sleep_debt;
+ }
+
+  // Need to refill more than one interval, so we need to sleep longer.
+
+ // Sleep just until `num_bytes` is allowed.
+ uint64_t sleep_amount =
+ static_cast<uint64_t>(num_bytes /
+ static_cast<long double>(delayed_write_rate_) *
+ kMicrosPerSecond) +
+ sleep_debt;
+ last_refill_time_ = time_now + sleep_amount;
+ return sleep_amount;
+}
+
+uint64_t WriteController::NowMicrosMonotonic(Env* env) {
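+  // std::milli::den is 1000, so this converts nanoseconds to microseconds.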
+ return env->NowNanos() / std::milli::den;
+}
+
+StopWriteToken::~StopWriteToken() {
+ assert(controller_->total_stopped_ >= 1);
+ --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+ controller_->total_delayed_--;
+ assert(controller_->total_delayed_.load() >= 0);
+}
+
+CompactionPressureToken::~CompactionPressureToken() {
+ controller_->total_compaction_pressure_--;
+ assert(controller_->total_compaction_pressure_ >= 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
new file mode 100644
index 000000000..785ae6896
--- /dev/null
+++ b/src/rocksdb/db/write_controller.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+#include "rocksdb/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class WriteControllerToken;
+
+// WriteController controls write stalls in our write code-path. Write stalls
+// happen when compaction can't keep up with the write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding the DB mutex.
+class WriteController {
+ public:
+ explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
+ int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
+ : total_stopped_(0),
+ total_delayed_(0),
+ total_compaction_pressure_(0),
+ bytes_left_(0),
+ last_refill_time_(0),
+ low_pri_rate_limiter_(
+ NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
+ set_max_delayed_write_rate(_delayed_write_rate);
+ }
+ ~WriteController() = default;
+
+ // When an actor (column family) requests a stop token, all writes will be
+ // stopped until the stop token is released (deleted)
+ std::unique_ptr<WriteControllerToken> GetStopToken();
+  // When an actor (column family) requests a delay token, the total delay for
+  // all writes to the DB will be controlled under the delayed write rate.
+  // Every write needs to call GetDelay() with the number of bytes it is
+  // writing to the DB, which returns the number of microseconds to sleep.
+ std::unique_ptr<WriteControllerToken> GetDelayToken(
+ uint64_t delayed_write_rate);
+  // When an actor (column family) requests a compaction pressure token, the
+  // number of compaction threads will be increased.
+ std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
+
+  // These three methods query the state of the WriteController.
+ bool IsStopped() const;
+ bool NeedsDelay() const { return total_delayed_.load() > 0; }
+ bool NeedSpeedupCompaction() const {
+ return IsStopped() || NeedsDelay() || total_compaction_pressure_ > 0;
+ }
+  // Returns how many microseconds the caller needs to sleep after the call.
+  // num_bytes: the number of bytes to put into the DB.
+  // Prerequisite: DB mutex held.
+ uint64_t GetDelay(Env* env, uint64_t num_bytes);
+ void set_delayed_write_rate(uint64_t write_rate) {
+    // avoid dividing by 0
+ if (write_rate == 0) {
+ write_rate = 1u;
+ } else if (write_rate > max_delayed_write_rate()) {
+ write_rate = max_delayed_write_rate();
+ }
+ delayed_write_rate_ = write_rate;
+ }
+
+ void set_max_delayed_write_rate(uint64_t write_rate) {
+    // avoid dividing by 0
+ if (write_rate == 0) {
+ write_rate = 1u;
+ }
+ max_delayed_write_rate_ = write_rate;
+ // update delayed_write_rate_ as well
+ delayed_write_rate_ = write_rate;
+ }
+
+ uint64_t delayed_write_rate() const { return delayed_write_rate_; }
+
+ uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
+
+ RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
+
+ private:
+ uint64_t NowMicrosMonotonic(Env* env);
+
+ friend class WriteControllerToken;
+ friend class StopWriteToken;
+ friend class DelayWriteToken;
+ friend class CompactionPressureToken;
+
+ std::atomic<int> total_stopped_;
+ std::atomic<int> total_delayed_;
+ std::atomic<int> total_compaction_pressure_;
+ uint64_t bytes_left_;
+ uint64_t last_refill_time_;
+  // write rate set at initialization or by `DBImpl::SetDBOptions`
+ uint64_t max_delayed_write_rate_;
+ // current write rate
+ uint64_t delayed_write_rate_;
+
+ std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
+};
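+
+// Illustrative usage sketch (write_controller, env, and batch_bytes are
+// placeholder names): while a delay token is alive, the write path asks
+// GetDelay() how long to stall and sleeps for that many microseconds.
+//
+//   auto token = write_controller.GetDelayToken(16 << 20 /* bytes per sec */);
+//   uint64_t delay_us = write_controller.GetDelay(env, batch_bytes);
+//   if (delay_us > 0) {
+//     env->SleepForMicroseconds(static_cast<int>(delay_us));
+//   }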
+
+class WriteControllerToken {
+ public:
+ explicit WriteControllerToken(WriteController* controller)
+ : controller_(controller) {}
+ virtual ~WriteControllerToken() {}
+
+ protected:
+ WriteController* controller_;
+
+ private:
+ // no copying allowed
+ WriteControllerToken(const WriteControllerToken&) = delete;
+ void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+ explicit StopWriteToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+ explicit DelayWriteToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~DelayWriteToken();
+};
+
+class CompactionPressureToken : public WriteControllerToken {
+ public:
+ explicit CompactionPressureToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~CompactionPressureToken();
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
new file mode 100644
index 000000000..72d116798
--- /dev/null
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <ratio>
+
+#include "db/write_controller.h"
+
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteControllerTest : public testing::Test {};
+
+class TimeSetEnv : public EnvWrapper {
+ public:
+ explicit TimeSetEnv() : EnvWrapper(nullptr) {}
+ uint64_t now_micros_ = 6666;
+ uint64_t NowNanos() override { return now_micros_ * std::milli::den; }
+};
+
+TEST_F(WriteControllerTest, ChangeDelayRateTest) {
+ TimeSetEnv env;
+ WriteController controller(40000000u); // also set max delayed rate
+ controller.set_delayed_write_rate(10000000u);
+ auto delay_token_0 =
+ controller.GetDelayToken(controller.delayed_write_rate());
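+  // At 10,000,000 bytes/sec, writing 20,000,000 bytes should be delayed by
+  // 20,000,000 / 10,000,000 = 2 seconds, i.e. 2,000,000 microseconds.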
+ ASSERT_EQ(static_cast<uint64_t>(2000000),
+ controller.GetDelay(&env, 20000000u));
+ auto delay_token_1 = controller.GetDelayToken(2000000u);
+ ASSERT_EQ(static_cast<uint64_t>(10000000),
+ controller.GetDelay(&env, 20000000u));
+ auto delay_token_2 = controller.GetDelayToken(1000000u);
+ ASSERT_EQ(static_cast<uint64_t>(20000000),
+ controller.GetDelay(&env, 20000000u));
+ auto delay_token_3 = controller.GetDelayToken(20000000u);
+ ASSERT_EQ(static_cast<uint64_t>(1000000),
+ controller.GetDelay(&env, 20000000u));
+ // This is more than max rate. Max delayed rate will be used.
+ auto delay_token_4 =
+ controller.GetDelayToken(controller.delayed_write_rate() * 3);
+ ASSERT_EQ(static_cast<uint64_t>(500000),
+ controller.GetDelay(&env, 20000000u));
+}
+
+TEST_F(WriteControllerTest, SanityTest) {
+ WriteController controller(10000000u);
+ auto stop_token_1 = controller.GetStopToken();
+ auto stop_token_2 = controller.GetStopToken();
+
+ ASSERT_TRUE(controller.IsStopped());
+ stop_token_1.reset();
+ ASSERT_TRUE(controller.IsStopped());
+ stop_token_2.reset();
+ ASSERT_FALSE(controller.IsStopped());
+
+ TimeSetEnv env;
+
+ auto delay_token_1 = controller.GetDelayToken(10000000u);
+ ASSERT_EQ(static_cast<uint64_t>(2000000),
+ controller.GetDelay(&env, 20000000u));
+
+ env.now_micros_ += 1999900u; // sleep debt 1000
+
+ auto delay_token_2 = controller.GetDelayToken(10000000u);
+ // Rate reset after changing the token.
+ ASSERT_EQ(static_cast<uint64_t>(2000000),
+ controller.GetDelay(&env, 20000000u));
+
+ env.now_micros_ += 1999900u; // sleep debt 1000
+
+ // One refill: 10240 bytes allowed, 1000 used, 9240 left
+ ASSERT_EQ(static_cast<uint64_t>(1124), controller.GetDelay(&env, 1000u));
+ env.now_micros_ += 1124u; // sleep debt 0
+
+ delay_token_2.reset();
+ // 1000 used, 8240 left
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+ env.now_micros_ += 100u; // sleep credit 100
+ // 1000 used, 7240 left
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+ env.now_micros_ += 100u; // sleep credit 200
+  // One refill: 10240 filled, sleep credit generates 2000. 8000 used
+ // 7240 + 10240 + 2000 - 8000 = 11480 left
+ ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u));
+
+ env.now_micros_ += 200u; // sleep debt 824
+ // 1000 used, 10480 left.
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+ env.now_micros_ += 200u; // sleep debt 624
+ // Out of bound sleep, still 10480 left
+ ASSERT_EQ(static_cast<uint64_t>(3000624u),
+ controller.GetDelay(&env, 30000000u));
+
+ env.now_micros_ += 3000724u; // sleep credit 100
+ // 6000 used, 4480 left.
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 6000u));
+
+ env.now_micros_ += 200u; // sleep credit 300
+ // One refill, credit 4480 balance + 3000 credit + 10240 refill
+ // Use 8000, 9720 left
+ ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u));
+
+ env.now_micros_ += 3024u; // sleep credit 2000
+
+ // 1720 left
+ ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+ // 1720 balance + 20000 credit = 20170 left
+ // Use 8000, 12170 left
+ ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+ // 4170 left
+ ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+ // Need a refill
+ ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 9000u));
+
+ delay_token_1.reset();
+ ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 30000000u));
+ delay_token_1.reset();
+ ASSERT_FALSE(controller.IsStopped());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
new file mode 100644
index 000000000..5f50bba63
--- /dev/null
+++ b/src/rocksdb/db/write_thread.cc
@@ -0,0 +1,777 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_thread.h"
+#include <chrono>
+#include <thread>
+#include "db/column_family.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+WriteThread::WriteThread(const ImmutableDBOptions& db_options)
+ : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
+ ? db_options.write_thread_max_yield_usec
+ : 0),
+ slow_yield_usec_(db_options.write_thread_slow_yield_usec),
+ allow_concurrent_memtable_write_(
+ db_options.allow_concurrent_memtable_write),
+ enable_pipelined_write_(db_options.enable_pipelined_write),
+ max_write_batch_group_size_bytes(
+ db_options.max_write_batch_group_size_bytes),
+ newest_writer_(nullptr),
+ newest_memtable_writer_(nullptr),
+ last_sequence_(0),
+ write_stall_dummy_(),
+ stall_mu_(),
+ stall_cv_(&stall_mu_) {}
+
+uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
+ // We're going to block. Lazily create the mutex. We guarantee
+ // propagation of this construction to the waker via the
+ // STATE_LOCKED_WAITING state. The waker won't try to touch the mutex
+ // or the condvar unless they CAS away the STATE_LOCKED_WAITING that
+ // we install below.
+ w->CreateMutex();
+
+ auto state = w->state.load(std::memory_order_acquire);
+ assert(state != STATE_LOCKED_WAITING);
+ if ((state & goal_mask) == 0 &&
+ w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) {
+ // we have permission (and an obligation) to use StateMutex
+ std::unique_lock<std::mutex> guard(w->StateMutex());
+ w->StateCV().wait(guard, [w] {
+ return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING;
+ });
+ state = w->state.load(std::memory_order_relaxed);
+ }
+ // else tricky. Goal is met or CAS failed. In the latter case the waker
+ // must have changed the state, and compare_exchange_strong has updated
+ // our local variable with the new one. At the moment WriteThread never
+ // waits for a transition across intermediate states, so we know that
+ // since a state change has occurred the goal must have been met.
+ assert((state & goal_mask) != 0);
+ return state;
+}
+
+uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
+ AdaptationContext* ctx) {
+ uint8_t state = 0;
+
+ // 1. Busy loop using "pause" for 1 micro sec
+ // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+ // 3. Else blocking wait
+
+ // On a modern Xeon each loop takes about 7 nanoseconds (most of which
+ // is the effect of the pause instruction), so 200 iterations is a bit
+ // more than a microsecond. This is long enough that waits longer than
+ // this can amortize the cost of accessing the clock and yielding.
+ for (uint32_t tries = 0; tries < 200; ++tries) {
+ state = w->state.load(std::memory_order_acquire);
+ if ((state & goal_mask) != 0) {
+ return state;
+ }
+ port::AsmVolatilePause();
+ }
+
+ // This is below the fast path, so that the stat is zero when all writes are
+ // from the same thread.
+ PERF_TIMER_GUARD(write_thread_wait_nanos);
+
+ // If we're only going to end up waiting a short period of time,
+ // it can be a lot more efficient to call std::this_thread::yield()
+ // in a loop than to block in StateMutex(). For reference, on my 4.0
+ // SELinux test server with support for syscall auditing enabled, the
+ // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is
+ // 2.7 usec, and the average is more like 10 usec. That can be a big
+ // drag on RockDB's single-writer design. Of course, spinning is a
+  // drag on RocksDB's single-writer design. Of course, spinning is a
+ // wait for a long time. How do we decide?
+ //
+ // We break waiting into 3 categories: short-uncontended,
+ // short-contended, and long. If we had an oracle, then we would always
+ // spin for short-uncontended, always block for long, and our choice for
+ // short-contended might depend on whether we were trying to optimize
+ // RocksDB throughput or avoid being greedy with system resources.
+ //
+ // Bucketing into short or long is easy by measuring elapsed time.
+ // Differentiating short-uncontended from short-contended is a bit
+ // trickier, but not too bad. We could look for involuntary context
+ // switches using getrusage(RUSAGE_THREAD, ..), but it's less work
+ // (portability code and CPU) to just look for yield calls that take
+ // longer than we expect. sched_yield() doesn't actually result in any
+ // context switch overhead if there are no other runnable processes
+ // on the current core, in which case it usually takes less than
+ // a microsecond.
+ //
+ // There are two primary tunables here: the threshold between "short"
+ // and "long" waits, and the threshold at which we suspect that a yield
+ // is slow enough to indicate we should probably block. If these
+ // thresholds are chosen well then CPU-bound workloads that don't
+ // have more threads than cores will experience few context switches
+ // (voluntary or involuntary), and the total number of context switches
+ // (voluntary and involuntary) will not be dramatically larger (maybe
+ // 2x) than the number of voluntary context switches that occur when
+ // --max_yield_wait_micros=0.
+ //
+ // There's another constant, which is the number of slow yields we will
+ // tolerate before reversing our previous decision. Solitary slow
+ // yields are pretty common (low-priority small jobs ready to run),
+ // so this should be at least 2. We set this conservatively to 3 so
+ // that we can also immediately schedule a ctx adaptation, rather than
+ // waiting for the next update_ctx.
+
+ const size_t kMaxSlowYieldsWhileSpinning = 3;
+
+  // Whether the yield approach has any credit in this context. The credit is
+  // added when a yield succeeds before timing out, and decreased otherwise.
+ auto& yield_credit = ctx->value;
+ // Update the yield_credit based on sample runs or right after a hard failure
+ bool update_ctx = false;
+ // Should we reinforce the yield credit
+ bool would_spin_again = false;
+  // The sampling base for updating the yield credit. The sampling rate is
+  // 1/sampling_base.
+ const int sampling_base = 256;
+
+ if (max_yield_usec_ > 0) {
+ update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
+
+ if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
+ // we're updating the adaptation statistics, or spinning has >
+ // 50% chance of being shorter than max_yield_usec_ and causing no
+ // involuntary context switches
+ auto spin_begin = std::chrono::steady_clock::now();
+
+ // this variable doesn't include the final yield (if any) that
+ // causes the goal to be met
+ size_t slow_yield_count = 0;
+
+ auto iter_begin = spin_begin;
+ while ((iter_begin - spin_begin) <=
+ std::chrono::microseconds(max_yield_usec_)) {
+ std::this_thread::yield();
+
+ state = w->state.load(std::memory_order_acquire);
+ if ((state & goal_mask) != 0) {
+ // success
+ would_spin_again = true;
+ break;
+ }
+
+ auto now = std::chrono::steady_clock::now();
+ if (now == iter_begin ||
+ now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) {
+ // conservatively count it as a slow yield if our clock isn't
+ // accurate enough to measure the yield duration
+ ++slow_yield_count;
+ if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
+ // Not just one ivcsw, but several. Immediately update yield_credit
+ // and fall back to blocking
+ update_ctx = true;
+ break;
+ }
+ }
+ iter_begin = now;
+ }
+ }
+ }
+
+ if ((state & goal_mask) == 0) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w);
+ state = BlockingAwaitState(w, goal_mask);
+ }
+
+ if (update_ctx) {
+ // Since our update is sample based, it is ok if a thread overwrites the
+ // updates by other threads. Thus the update does not have to be atomic.
+ auto v = yield_credit.load(std::memory_order_relaxed);
+ // fixed point exponential decay with decay constant 1/1024, with +1
+ // and -1 scaled to avoid overflow for int32_t
+ //
+    // On each update the positive credit is decayed by a factor of 1/1024
+    // (i.e., 0.1%). If the sampled yield was successful, the credit is also
+    // increased by X. Setting X=2^17 ensures that the credit never exceeds
+    // 2^17*2^10=2^27, which is lower than 2^31, the upper bound of int32_t.
+    // The same logic applies to negative credits.
+ v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072;
+ yield_credit.store(v, std::memory_order_relaxed);
+ }
+
+ assert((state & goal_mask) != 0);
+ return state;
+}
+
+void WriteThread::SetState(Writer* w, uint8_t new_state) {
+ auto state = w->state.load(std::memory_order_acquire);
+ if (state == STATE_LOCKED_WAITING ||
+ !w->state.compare_exchange_strong(state, new_state)) {
+ assert(state == STATE_LOCKED_WAITING);
+
+ std::lock_guard<std::mutex> guard(w->StateMutex());
+ assert(w->state.load(std::memory_order_relaxed) != new_state);
+ w->state.store(new_state, std::memory_order_relaxed);
+ w->StateCV().notify_one();
+ }
+}
+
+bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
+ assert(newest_writer != nullptr);
+ assert(w->state == STATE_INIT);
+ Writer* writers = newest_writer->load(std::memory_order_relaxed);
+ while (true) {
+    // If a write stall is in effect and w->no_slowdown is not true, block
+    // here until the stall is cleared. If it is true, then return
+    // immediately.
+ if (writers == &write_stall_dummy_) {
+ if (w->no_slowdown) {
+ w->status = Status::Incomplete("Write stall");
+ SetState(w, STATE_COMPLETED);
+ return false;
+ }
+ // Since no_slowdown is false, wait here to be notified of the write
+ // stall clearing
+ {
+ MutexLock lock(&stall_mu_);
+ writers = newest_writer->load(std::memory_order_relaxed);
+ if (writers == &write_stall_dummy_) {
+ stall_cv_.Wait();
+ // Load newest_writers_ again since it may have changed
+ writers = newest_writer->load(std::memory_order_relaxed);
+ continue;
+ }
+ }
+ }
+ w->link_older = writers;
+ if (newest_writer->compare_exchange_weak(writers, w)) {
+ return (writers == nullptr);
+ }
+ }
+}
+
+bool WriteThread::LinkGroup(WriteGroup& write_group,
+ std::atomic<Writer*>* newest_writer) {
+ assert(newest_writer != nullptr);
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+ Writer* w = last_writer;
+ while (true) {
+    // Unset link_newer pointers to make sure that when we call
+    // CreateMissingNewerLinks later it creates all missing links.
+ w->link_newer = nullptr;
+ w->write_group = nullptr;
+ if (w == leader) {
+ break;
+ }
+ w = w->link_older;
+ }
+ Writer* newest = newest_writer->load(std::memory_order_relaxed);
+ while (true) {
+ leader->link_older = newest;
+ if (newest_writer->compare_exchange_weak(newest, last_writer)) {
+ return (newest == nullptr);
+ }
+ }
+}
+
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+ while (true) {
+ Writer* next = head->link_older;
+ if (next == nullptr || next->link_newer != nullptr) {
+ assert(next == nullptr || next->link_newer == head);
+ break;
+ }
+ next->link_newer = head;
+ head = next;
+ }
+}
+
+WriteThread::Writer* WriteThread::FindNextLeader(Writer* from,
+ Writer* boundary) {
+ assert(from != nullptr && from != boundary);
+ Writer* current = from;
+ while (current->link_older != boundary) {
+ current = current->link_older;
+ assert(current != nullptr);
+ }
+ return current;
+}
+
+void WriteThread::CompleteLeader(WriteGroup& write_group) {
+ assert(write_group.size > 0);
+ Writer* leader = write_group.leader;
+ if (write_group.size == 1) {
+ write_group.leader = nullptr;
+ write_group.last_writer = nullptr;
+ } else {
+ assert(leader->link_newer != nullptr);
+ leader->link_newer->link_older = nullptr;
+ write_group.leader = leader->link_newer;
+ }
+ write_group.size -= 1;
+ SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
+ assert(write_group.size > 1);
+ assert(w != write_group.leader);
+ if (w == write_group.last_writer) {
+ w->link_older->link_newer = nullptr;
+ write_group.last_writer = w->link_older;
+ } else {
+ w->link_older->link_newer = w->link_newer;
+ w->link_newer->link_older = w->link_older;
+ }
+ write_group.size -= 1;
+ SetState(w, STATE_COMPLETED);
+}
+
+void WriteThread::BeginWriteStall() {
+ LinkOne(&write_stall_dummy_, &newest_writer_);
+
+  // Walk the writer list until w->write_group != nullptr. The current write
+  // group will not have a mix of slowdown/no_slowdown, so it's ok to stop at
+  // that point.
+ Writer* w = write_stall_dummy_.link_older;
+ Writer* prev = &write_stall_dummy_;
+ while (w != nullptr && w->write_group == nullptr) {
+ if (w->no_slowdown) {
+ prev->link_older = w->link_older;
+ w->status = Status::Incomplete("Write stall");
+ SetState(w, STATE_COMPLETED);
+ if (prev->link_older) {
+ prev->link_older->link_newer = prev;
+ }
+ w = prev->link_older;
+ } else {
+ prev = w;
+ w = w->link_older;
+ }
+ }
+}
+
+void WriteThread::EndWriteStall() {
+ MutexLock lock(&stall_mu_);
+
+  // Unlink write_stall_dummy_ from the write queue. This unblocks pending
+  // write threads so they can enqueue themselves.
+ assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_);
+ assert(write_stall_dummy_.link_older != nullptr);
+ write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer;
+ newest_writer_.exchange(write_stall_dummy_.link_older);
+
+ // Wake up writers
+ stall_cv_.SignalAll();
+}
+
+static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup");
+void WriteThread::JoinBatchGroup(Writer* w) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w);
+ assert(w->batch != nullptr);
+
+ bool linked_as_leader = LinkOne(w, &newest_writer_);
+
+ if (linked_as_leader) {
+ SetState(w, STATE_GROUP_LEADER);
+ }
+
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
+
+ if (!linked_as_leader) {
+    /**
+     * Wait until:
+     * 1) An existing leader picks us as the new leader when it finishes
+     * 2) An existing leader picks us as its follower and
+     * 2.1) finishes the memtable writes on our behalf
+     * 2.2) Or tells us to finish the memtable writes in parallel
+     * 3) (pipelined write) An existing leader picks us as its follower and
+     *    finishes book-keeping and the WAL write for us, enqueues us as a
+     *    pending memtable writer, and
+     * 3.1) we become the memtable writer group leader, or
+     * 3.2) an existing memtable writer group leader tells us to finish
+     *      memtable writes in parallel.
+     */
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
+ AwaitState(w, STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+ STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+ &jbg_ctx);
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
+ }
+}
+
+size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
+ WriteGroup* write_group) {
+ assert(leader->link_older == nullptr);
+ assert(leader->batch != nullptr);
+ assert(write_group != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = max_write_batch_group_size_bytes;
+ const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+ if (size <= min_batch_size_bytes) {
+ max_size = size + min_batch_size_bytes;
+ }
+
+ leader->write_group = write_group;
+ write_group->leader = leader;
+ write_group->last_writer = leader;
+ write_group->size = 1;
+ Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+ // This is safe regardless of any db mutex status of the caller. Previous
+ // calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks
+  // (they emptied the list and then we added ourselves as leader) or had to
+  // explicitly wake us up (the list was non-empty when we added ourselves,
+ // so we have already received our MarkJoined).
+ CreateMissingNewerLinks(newest_writer);
+
+ // Tricky. Iteration start (leader) is exclusive and finish
+ // (newest_writer) is inclusive. Iteration goes from old to new.
+ Writer* w = leader;
+ while (w != newest_writer) {
+ w = w->link_newer;
+
+ if (w->sync && !leader->sync) {
+ // Do not include a sync write into a batch handled by a non-sync write.
+ break;
+ }
+
+ if (w->no_slowdown != leader->no_slowdown) {
+ // Do not mix writes that are ok with delays with the ones that
+ // request fail on delays.
+ break;
+ }
+
+ if (w->disable_wal != leader->disable_wal) {
+      // Do not mix writes that enable WAL with the ones whose WAL is
+      // disabled.
+ break;
+ }
+
+ if (w->batch == nullptr) {
+      // Do not include writes with a nullptr batch. Those are not writes;
+      // those are something else. They want to be alone.
+ break;
+ }
+
+ if (w->callback != nullptr && !w->callback->AllowWriteBatching()) {
+      // don't batch writes that don't want to be batched
+ break;
+ }
+
+ auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+ if (size + batch_size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+
+ w->write_group = write_group;
+ size += batch_size;
+ write_group->last_writer = w;
+ write_group->size++;
+ }
+ TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w);
+ return size;
+}
+
+void WriteThread::EnterAsMemTableWriter(Writer* leader,
+ WriteGroup* write_group) {
+ assert(leader != nullptr);
+ assert(leader->link_older == nullptr);
+ assert(leader->batch != nullptr);
+ assert(write_group != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = max_write_batch_group_size_bytes;
+ const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+ if (size <= min_batch_size_bytes) {
+ max_size = size + min_batch_size_bytes;
+ }
+
+ leader->write_group = write_group;
+ write_group->leader = leader;
+ write_group->size = 1;
+ Writer* last_writer = leader;
+
+ if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) {
+ Writer* newest_writer = newest_memtable_writer_.load();
+ CreateMissingNewerLinks(newest_writer);
+
+ Writer* w = leader;
+ while (w != newest_writer) {
+ w = w->link_newer;
+
+ if (w->batch == nullptr) {
+ break;
+ }
+
+ if (w->batch->HasMerge()) {
+ break;
+ }
+
+ if (!allow_concurrent_memtable_write_) {
+ auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+ if (size + batch_size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+ size += batch_size;
+ }
+
+ w->write_group = write_group;
+ last_writer = w;
+ write_group->size++;
+ }
+ }
+
+ write_group->last_writer = last_writer;
+ write_group->last_sequence =
+ last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1;
+}
+
+void WriteThread::ExitAsMemTableWriter(Writer* /*self*/,
+ WriteGroup& write_group) {
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+
+ Writer* newest_writer = last_writer;
+ if (!newest_memtable_writer_.compare_exchange_strong(newest_writer,
+ nullptr)) {
+ CreateMissingNewerLinks(newest_writer);
+ Writer* next_leader = last_writer->link_newer;
+ assert(next_leader != nullptr);
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER);
+ }
+ Writer* w = leader;
+ while (true) {
+ if (!write_group.status.ok()) {
+ w->status = write_group.status;
+ }
+ Writer* next = w->link_newer;
+ if (w != leader) {
+ SetState(w, STATE_COMPLETED);
+ }
+ if (w == last_writer) {
+ break;
+ }
+ w = next;
+ }
+ // Note that leader has to exit last, since it owns the write group.
+ SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
+ assert(write_group != nullptr);
+ write_group->running.store(write_group->size);
+ for (auto w : *write_group) {
+ SetState(w, STATE_PARALLEL_MEMTABLE_WRITER);
+ }
+}
+
+static WriteThread::AdaptationContext cpmtw_ctx("CompleteParallelMemTableWriter");
+// This method is called by both the leader and parallel followers
+bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
+
+ auto* write_group = w->write_group;
+ if (!w->status.ok()) {
+ std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
+ write_group->status = w->status;
+ }
+
+ if (write_group->running-- > 1) {
+ // we're not the last one
+ AwaitState(w, STATE_COMPLETED, &cpmtw_ctx);
+ return false;
+ }
+ // else we're the last parallel worker and should perform exit duties.
+ w->status = write_group->status;
+ return true;
+}
+
+void WriteThread::ExitAsBatchGroupFollower(Writer* w) {
+ auto* write_group = w->write_group;
+
+ assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER);
+ assert(write_group->status.ok());
+ ExitAsBatchGroupLeader(*write_group, write_group->status);
+ assert(w->status.ok());
+ assert(w->state == STATE_COMPLETED);
+ SetState(write_group->leader, STATE_COMPLETED);
+}
+
+static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader");
+void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
+ Status status) {
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+ assert(leader->link_older == nullptr);
+
+ // Propagate memtable write error to the whole group.
+ if (status.ok() && !write_group.status.ok()) {
+ status = write_group.status;
+ }
+
+ if (enable_pipelined_write_) {
+    // Notify writers that don't write to the memtable to exit.
+ for (Writer* w = last_writer; w != leader;) {
+ Writer* next = w->link_older;
+ w->status = status;
+ if (!w->ShouldWriteToMemtable()) {
+ CompleteFollower(w, write_group);
+ }
+ w = next;
+ }
+ if (!leader->ShouldWriteToMemtable()) {
+ CompleteLeader(write_group);
+ }
+
+ Writer* next_leader = nullptr;
+
+    // Look for the next leader before we call LinkGroup. If there are no
+    // pending writers, place a dummy writer at the tail of the queue
+ // so we know the boundary of the current write group.
+ Writer dummy;
+ Writer* expected = last_writer;
+ bool has_dummy = newest_writer_.compare_exchange_strong(expected, &dummy);
+ if (!has_dummy) {
+      // We found at least one pending writer when we inserted the dummy. We
+      // search for the next leader from there.
+ next_leader = FindNextLeader(expected, last_writer);
+ assert(next_leader != nullptr && next_leader != last_writer);
+ }
+
+    // Link the remainder of the group to the memtable writer list.
+    //
+    // We have to link our group to the memtable writer queue before waking up
+    // the next leader or setting newest_writer_ to null, otherwise the next
+    // leader can run ahead of us and link to the memtable writer queue before
+    // we do.
+ if (write_group.size > 0) {
+ if (LinkGroup(write_group, &newest_memtable_writer_)) {
+ // The leader can now be different from current writer.
+ SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER);
+ }
+ }
+
+    // If we inserted the dummy into the queue, remove it now and check
+    // whether any pending writers joined the queue since we inserted it. If
+    // so, look for the next leader again.
+ if (has_dummy) {
+ assert(next_leader == nullptr);
+ expected = &dummy;
+ bool has_pending_writer =
+ !newest_writer_.compare_exchange_strong(expected, nullptr);
+ if (has_pending_writer) {
+ next_leader = FindNextLeader(expected, &dummy);
+ assert(next_leader != nullptr && next_leader != &dummy);
+ }
+ }
+
+ if (next_leader != nullptr) {
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_GROUP_LEADER);
+ }
+ AwaitState(leader, STATE_MEMTABLE_WRITER_LEADER |
+ STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+ &eabgl_ctx);
+ } else {
+ Writer* head = newest_writer_.load(std::memory_order_acquire);
+ if (head != last_writer ||
+ !newest_writer_.compare_exchange_strong(head, nullptr)) {
+      // Either last_writer wasn't the head during the load(), or it was the
+      // head during the load() but somebody else pushed onto the list before
+ // we did the compare_exchange_strong (causing it to fail). In the
+ // latter case compare_exchange_strong has the effect of re-reading
+ // its first param (head). No need to retry a failing CAS, because
+ // only a departing leader (which we are at the moment) can remove
+ // nodes from the list.
+ assert(head != last_writer);
+
+      // After walking link_older starting from head (if not already done),
+      // we will be able to traverse last_writer->link_newer below. This
+      // function can only be called from an active leader; only a leader can
+      // clear newest_writer_, we did not clear it, and only a cleared
+      // newest_writer_ could let the next leader start its work without
+      // being explicitly woken by us, so we can conclude that no other
+      // leader work is going on here (with or without the db mutex).
+ CreateMissingNewerLinks(head);
+ assert(last_writer->link_newer->link_older == last_writer);
+ last_writer->link_newer->link_older = nullptr;
+
+      // The next leader didn't self-identify, because newest_writer_ wasn't
+      // nullptr when it enqueued (we were definitely enqueued before it and
+      // are still in the list). That means leader handoff occurs when we
+      // call SetState below.
+ SetState(last_writer->link_newer, STATE_GROUP_LEADER);
+ }
+ // else nobody else was waiting, although there might already be a new
+ // leader now
+
+ while (last_writer != leader) {
+ last_writer->status = status;
+      // We need to read link_older before calling SetState, because as soon
+      // as the writer is marked completed the other thread's AwaitState may
+      // return and deallocate the Writer.
+ auto next = last_writer->link_older;
+ SetState(last_writer, STATE_COMPLETED);
+
+ last_writer = next;
+ }
+ }
+}
+
+static WriteThread::AdaptationContext eu_ctx("EnterUnbatched");
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+ assert(w != nullptr && w->batch == nullptr);
+ mu->Unlock();
+ bool linked_as_leader = LinkOne(w, &newest_writer_);
+ if (!linked_as_leader) {
+ TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
+ // Last leader will not pick us as a follower since our batch is nullptr
+ AwaitState(w, STATE_GROUP_LEADER, &eu_ctx);
+ }
+ if (enable_pipelined_write_) {
+ WaitForMemTableWriters();
+ }
+ mu->Lock();
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+ assert(w != nullptr);
+ Writer* newest_writer = w;
+ if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
+ CreateMissingNewerLinks(newest_writer);
+ Writer* next_leader = w->link_newer;
+ assert(next_leader != nullptr);
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_GROUP_LEADER);
+ }
+}
+
+static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters");
+void WriteThread::WaitForMemTableWriters() {
+ assert(enable_pipelined_write_);
+ if (newest_memtable_writer_.load() == nullptr) {
+ return;
+ }
+ Writer w;
+ if (!LinkOne(&w, &newest_memtable_writer_)) {
+ AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx);
+ }
+ newest_memtable_writer_.store(nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 000000000..878199714
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,431 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+ enum State : uint8_t {
+ // The initial state of a writer. This is a Writer that is
+ // waiting in JoinBatchGroup. This state can be left when another
+ // thread informs the waiter that it has become a group leader
+ // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+ // non-parallel informs a follower that its writes have been committed
+ // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+    // updates in parallel and needs this Writer to apply its batch
+    // (-> STATE_PARALLEL_MEMTABLE_WRITER).
+ STATE_INIT = 1,
+
+ // The state used to inform a waiting Writer that it has become the
+ // leader, and it should now build a write batch group. Tricky:
+ // this state is not used if newest_writer_ is empty when a writer
+ // enqueues itself, because there is no need to wait (or even to
+ // create the mutex and condvar used to wait) in that case. This is
+ // a terminal state unless the leader chooses to make this a parallel
+ // batch, in which case the last parallel worker to finish will move
+ // the leader to STATE_COMPLETED.
+ STATE_GROUP_LEADER = 2,
+
+ // The state used to inform a waiting writer that it has become the
+    // leader of a memtable writer group. The leader will either write the
+    // memtable for the whole group, or launch a parallel group write to the
+    // memtable by calling LaunchParallelMemTableWriters.
+ STATE_MEMTABLE_WRITER_LEADER = 4,
+
+ // The state used to inform a waiting writer that it has become a
+    // parallel memtable writer. It can be the group leader that launched the
+    // parallel writer group, or one of the followers. The writer should then
+ // apply its batch to the memtable concurrently and call
+ // CompleteParallelMemTableWriter.
+ STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+ // A follower whose writes have been applied, or a parallel leader
+ // whose followers have all finished their work. This is a terminal
+ // state.
+ STATE_COMPLETED = 16,
+
+ // A state indicating that the thread may be waiting using StateMutex()
+    // and StateCV().
+ STATE_LOCKED_WAITING = 32,
+ };
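+
+  // Informal sketch of the common transitions described above (not
+  // exhaustive; STATE_LOCKED_WAITING is orthogonal and only marks a writer
+  // that is blocking on StateMutex()/StateCV() instead of spinning):
+  //
+  //   STATE_INIT -> STATE_GROUP_LEADER              (picked as WAL group leader)
+  //   STATE_INIT -> STATE_COMPLETED                 (leader wrote on our behalf)
+  //   STATE_INIT -> STATE_PARALLEL_MEMTABLE_WRITER  (parallel memtable write)
+  //   STATE_GROUP_LEADER -> STATE_MEMTABLE_WRITER_LEADER  (pipelined write)
+  //   STATE_MEMTABLE_WRITER_LEADER / STATE_PARALLEL_MEMTABLE_WRITER
+  //       -> STATE_COMPLETED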
+
+ struct Writer;
+
+ struct WriteGroup {
+ Writer* leader = nullptr;
+ Writer* last_writer = nullptr;
+ SequenceNumber last_sequence;
+    // Before `running` reaches zero, updates to `status` require holding
+    // leader->StateMutex().
+ Status status;
+ std::atomic<size_t> running;
+ size_t size = 0;
+
+ struct Iterator {
+ Writer* writer;
+ Writer* last_writer;
+
+ explicit Iterator(Writer* w, Writer* last)
+ : writer(w), last_writer(last) {}
+
+ Writer* operator*() const { return writer; }
+
+ Iterator& operator++() {
+ assert(writer != nullptr);
+ if (writer == last_writer) {
+ writer = nullptr;
+ } else {
+ writer = writer->link_newer;
+ }
+ return *this;
+ }
+
+ bool operator!=(const Iterator& other) const {
+ return writer != other.writer;
+ }
+ };
+
+ Iterator begin() const { return Iterator(leader, last_writer); }
+ Iterator end() const { return Iterator(nullptr, nullptr); }
+ };
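+
+  // A WriteGroup can be traversed with a range-for loop over its members,
+  // e.g. (illustrative sketch; `group` is a hypothetical, fully formed
+  // WriteGroup):
+  //
+  //   for (Writer* writer : group) {
+  //     if (writer->ShouldWriteToMemtable()) {
+  //       // apply writer->batch to the memtable
+  //     }
+  //   }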
+
+ // Information kept for every waiting writer.
+ struct Writer {
+ WriteBatch* batch;
+ bool sync;
+ bool no_slowdown;
+ bool disable_wal;
+ bool disable_memtable;
+ size_t batch_cnt; // if non-zero, number of sub-batches in the write batch
+ PreReleaseCallback* pre_release_callback;
+ uint64_t log_used; // log number that this batch was inserted into
+ uint64_t log_ref; // log number that memtable insert should reference
+ WriteCallback* callback;
+ bool made_waitable; // records lazy construction of mutex and cv
+ std::atomic<uint8_t> state; // write under StateMutex() or pre-link
+ WriteGroup* write_group;
+ SequenceNumber sequence; // the sequence number to use for the first key
+ Status status;
+ Status callback_status; // status returned by callback->Callback()
+
+ std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+ std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+ Writer* link_older; // read/write only before linking, or as leader
+ Writer* link_newer; // lazy, read/write only before linking, or as leader
+
+ Writer()
+ : batch(nullptr),
+ sync(false),
+ no_slowdown(false),
+ disable_wal(false),
+ disable_memtable(false),
+ batch_cnt(0),
+ pre_release_callback(nullptr),
+ log_used(0),
+ log_ref(0),
+ callback(nullptr),
+ made_waitable(false),
+ state(STATE_INIT),
+ write_group(nullptr),
+ sequence(kMaxSequenceNumber),
+ link_older(nullptr),
+ link_newer(nullptr) {}
+
+ Writer(const WriteOptions& write_options, WriteBatch* _batch,
+ WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+ size_t _batch_cnt = 0,
+ PreReleaseCallback* _pre_release_callback = nullptr)
+ : batch(_batch),
+ sync(write_options.sync),
+ no_slowdown(write_options.no_slowdown),
+ disable_wal(write_options.disableWAL),
+ disable_memtable(_disable_memtable),
+ batch_cnt(_batch_cnt),
+ pre_release_callback(_pre_release_callback),
+ log_used(0),
+ log_ref(_log_ref),
+ callback(_callback),
+ made_waitable(false),
+ state(STATE_INIT),
+ write_group(nullptr),
+ sequence(kMaxSequenceNumber),
+ link_older(nullptr),
+ link_newer(nullptr) {}
+
+ ~Writer() {
+ if (made_waitable) {
+ StateMutex().~mutex();
+ StateCV().~condition_variable();
+ }
+ }
+
+ bool CheckCallback(DB* db) {
+ if (callback != nullptr) {
+ callback_status = callback->Callback(db);
+ }
+ return callback_status.ok();
+ }
+
+ void CreateMutex() {
+ if (!made_waitable) {
+ // Note that made_waitable is tracked separately from state
+ // transitions, because we can't atomically create the mutex and
+ // link into the list.
+ made_waitable = true;
+ new (&state_mutex_bytes) std::mutex;
+ new (&state_cv_bytes) std::condition_variable;
+ }
+ }
+
+ // returns the aggregate status of this Writer
+ Status FinalStatus() {
+ if (!status.ok()) {
+        // a non-ok memtable write status takes precedence
+ assert(callback == nullptr || callback_status.ok());
+ return status;
+ } else if (!callback_status.ok()) {
+ // if the callback failed then that is the status we want
+ // because a memtable insert should not have been attempted
+ assert(callback != nullptr);
+ assert(status.ok());
+ return callback_status;
+ } else {
+ // if there is no callback then we only care about
+ // the memtable insert status
+ assert(callback == nullptr || callback_status.ok());
+ return status;
+ }
+ }
+
+ bool CallbackFailed() {
+ return (callback != nullptr) && !callback_status.ok();
+ }
+
+ bool ShouldWriteToMemtable() {
+ return status.ok() && !CallbackFailed() && !disable_memtable;
+ }
+
+ bool ShouldWriteToWAL() {
+ return status.ok() && !CallbackFailed() && !disable_wal;
+ }
+
+    // No other mutexes may be acquired while holding StateMutex(); it is
+    // always last in the lock order.
+ std::mutex& StateMutex() {
+ assert(made_waitable);
+ return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
+ }
+
+ std::condition_variable& StateCV() {
+ assert(made_waitable);
+ return *static_cast<std::condition_variable*>(
+ static_cast<void*>(&state_cv_bytes));
+ }
+ };
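+
+  // The state mutex and condition variable above are constructed lazily via
+  // placement new (see CreateMutex()), so writers that never have to block
+  // never pay for them. A minimal sketch of the waiting pattern, assuming
+  // `w` is a Writer about to block and `goal_mask` is a hypothetical bit
+  // mask of acceptable states (the real logic, including the
+  // STATE_LOCKED_WAITING handshake, lives in BlockingAwaitState()):
+  //
+  //   w->CreateMutex();  // no-op after the first call
+  //   std::unique_lock<std::mutex> guard(w->StateMutex());
+  //   w->StateCV().wait(guard, [&] {
+  //     return (w->state.load(std::memory_order_acquire) & goal_mask) != 0;
+  //   });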
+
+ struct AdaptationContext {
+ const char* name;
+ std::atomic<int32_t> value;
+
+ explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+ };
+
+ explicit WriteThread(const ImmutableDBOptions& db_options);
+
+ virtual ~WriteThread() = default;
+
+ // IMPORTANT: None of the methods in this class rely on the db mutex
+ // for correctness. All of the methods except JoinBatchGroup and
+ // EnterUnbatched may be called either with or without the db mutex held.
+ // Correctness is maintained by ensuring that only a single thread is
+ // a leader at a time.
+
+ // Registers w as ready to become part of a batch group, waits until the
+ // caller should perform some work, and returns the current state of the
+ // writer. If w has become the leader of a write batch group, returns
+  // STATE_GROUP_LEADER. If w has been made part of a sequential batch
+  // group and the leader has performed the write, returns STATE_COMPLETED.
+  // If w has been made part of a parallel batch group and is responsible
+  // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+ //
+ // The db mutex SHOULD NOT be held when calling this function, because
+ // it will block.
+ //
+ // Writer* w: Writer to be executed as part of a batch group
+ void JoinBatchGroup(Writer* w);
+
+ // Constructs a write batch group led by leader, which should be a
+ // Writer passed to JoinBatchGroup on the current thread.
+ //
+ // Writer* leader: Writer that is STATE_GROUP_LEADER
+ // WriteGroup* write_group: Out-param of group members
+ // returns: Total batch group byte size
+ size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+ // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
+ // and wakes up the next leader (if any).
+ //
+ // WriteGroup* write_group: the write group
+ // Status status: Status of write operation
+ void ExitAsBatchGroupLeader(WriteGroup& write_group, Status status);
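+
+  // Illustrative, non-normative sketch of a write that goes through the
+  // group machinery above; `write_thread`, `options`, `batch` and `s` are
+  // hypothetical caller-owned names, and the parallel/pipelined states are
+  // handled by the methods further below:
+  //
+  //   WriteThread::Writer w(options, &batch, /*_callback=*/nullptr,
+  //                         /*_log_ref=*/0, /*_disable_memtable=*/false);
+  //   write_thread.JoinBatchGroup(&w);
+  //   if (w.state == WriteThread::STATE_GROUP_LEADER) {
+  //     WriteThread::WriteGroup group;
+  //     write_thread.EnterAsBatchGroupLeader(&w, &group);
+  //     // ... write the WAL (and possibly the memtable) for `group` ...
+  //     write_thread.ExitAsBatchGroupLeader(group, s);
+  //   }
+  //   s = w.FinalStatus();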
+
+ // Exit batch group on behalf of batch group leader.
+ void ExitAsBatchGroupFollower(Writer* w);
+
+  // Constructs a write batch group led by leader from the
+  // newest_memtable_writer_ list. The leader should either write the
+  // memtable for the whole group and call ExitAsMemTableWriter, or launch a
+  // parallel memtable write through LaunchParallelMemTableWriters.
+  void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_group);
+
+  // The memtable writer group leader, or the last finished writer in a
+  // parallel write group, exits from the newest_memtable_writer_ list and
+  // wakes up the next leader if needed.
+ void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+  // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all
+  // of the non-leader members of this write batch group. Sets
+  // Writer::sequence before waking them up.
+ //
+ // WriteGroup* write_group: Extra state used to coordinate the parallel add
+ void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+ // Reports the completion of w's batch to the parallel group leader, and
+ // waits for the rest of the parallel batch to complete. Returns true
+  // if this thread is the last to complete, and hence should perform the
+  // group's exit duties (advance the last sequence and exit the batch group,
+  // e.g. via ExitAsBatchGroupLeader or ExitAsBatchGroupFollower); false if
+  // someone else has already taken responsibility for that.
+ bool CompleteParallelMemTableWriter(Writer* w);
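+
+  // Illustrative sketch of the parallel memtable write path, assuming the
+  // leader has already written the WAL for the whole group (`write_thread`,
+  // `group` and `w` are hypothetical caller-owned names):
+  //
+  //   write_thread.LaunchParallelMemTableWriters(&group);   // leader only
+  //   // Every member (leader and followers) then applies its own batch to
+  //   // the memtable and calls:
+  //   if (write_thread.CompleteParallelMemTableWriter(&w)) {
+  //     // last writer to finish: perform the group's exit duties
+  //   }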
+
+ // Waits for all preceding writers (unlocking mu while waiting), then
+ // registers w as the currently proceeding writer.
+ //
+ // Writer* w: A Writer not eligible for batching
+ // InstrumentedMutex* mu: The db mutex, to unlock while waiting
+ // REQUIRES: db mutex held
+ void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+ // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+ // writers.
+ void ExitUnbatched(Writer* w);
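+
+  // Illustrative sketch of an unbatched (exclusive) writer; `write_thread`
+  // is a hypothetical name and `mutex_` stands for the db mutex, which must
+  // be held by the caller:
+  //
+  //   WriteThread::Writer w;                 // w.batch stays nullptr
+  //   write_thread.EnterUnbatched(&w, &mutex_);
+  //   // ... exclusive section: no other writer is in progress ...
+  //   write_thread.ExitUnbatched(&w);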
+
+  // Wait for all pending memtable writers to finish. Used only when
+  // pipelined write is enabled.
+ void WaitForMemTableWriters();
+
+ SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+ if (sequence > last_sequence_) {
+ last_sequence_ = sequence;
+ }
+ return last_sequence_;
+ }
+
+ // Insert a dummy writer at the tail of the write queue to indicate a write
+ // stall, and fail any writers in the queue with no_slowdown set to true
+ void BeginWriteStall();
+
+ // Remove the dummy writer and wake up waiting writers
+ void EndWriteStall();
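+
+  // Sketch of the intended call pattern (illustrative; the DB invokes these
+  // around a stall condition):
+  //
+  //   write_thread.BeginWriteStall();  // queued no_slowdown writers fail,
+  //                                    // others park on stall_cv_
+  //   // ... stall condition clears ...
+  //   write_thread.EndWriteStall();    // wake up the parked writers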
+
+ private:
+ // See AwaitState.
+ const uint64_t max_yield_usec_;
+ const uint64_t slow_yield_usec_;
+
+  // Allow multiple writers to write to the memtable concurrently.
+ const bool allow_concurrent_memtable_write_;
+
+ // Enable pipelined write to WAL and memtable.
+ const bool enable_pipelined_write_;
+
+  // The maximum number of bytes that may be written in a single batch of
+  // WAL or memtable writes. It is only enforced when the leader's write
+  // size is larger than 1/8 of this limit.
+ const uint64_t max_write_batch_group_size_bytes;
+
+  // Points to the newest pending writer. Only the leader can remove
+  // elements; adding can be done lock-free by anybody.
+ std::atomic<Writer*> newest_writer_;
+
+ // Points to the newest pending memtable writer. Used only when pipelined
+ // write is enabled.
+ std::atomic<Writer*> newest_memtable_writer_;
+
+  // The last sequence number that has been consumed by a writer. The
+  // sequence is not necessarily visible to reads because the write can
+  // still be ongoing.
+ SequenceNumber last_sequence_;
+
+ // A dummy writer to indicate a write stall condition. This will be inserted
+ // at the tail of the writer queue by the leader, so newer writers can just
+ // check for this and bail
+ Writer write_stall_dummy_;
+
+ // Mutex and condvar for writers to block on a write stall. During a write
+  // stall, writers with no_slowdown set to false will wait on this rather
+  // than on the writer queue.
+ port::Mutex stall_mu_;
+ port::CondVar stall_cv_;
+
+ // Waits for w->state & goal_mask using w->StateMutex(). Returns
+ // the state that satisfies goal_mask.
+ uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+ // Blocks until w->state & goal_mask, returning the state value
+ // that satisfied the predicate. Uses ctx to adaptively use
+ // std::this_thread::yield() to avoid mutex overheads. ctx should be
+ // a context-dependent static.
+ uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+ // Set writer state and wake the writer up if it is waiting.
+ void SetState(Writer* w, uint8_t new_state);
+
+ // Links w into the newest_writer list. Return true if w was linked directly
+ // into the leader position. Safe to call from multiple threads without
+ // external locking.
+ bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+ // Link write group into the newest_writer list as a whole, while keeping the
+ // order of the writers unchanged. Return true if the group was linked
+ // directly into the leader position.
+ bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+ // Computes any missing link_newer links. Should not be called
+ // concurrently with itself.
+ void CreateMissingNewerLinks(Writer* head);
+
+  // Starting from a pending writer, follow link_older to search for the
+  // next leader, until we hit the boundary.
+ Writer* FindNextLeader(Writer* pending_writer, Writer* boundary);
+
+ // Set the leader in write_group to completed state and remove it from the
+ // write group.
+ void CompleteLeader(WriteGroup& write_group);
+
+ // Set a follower in write_group to completed state and remove it from the
+ // write group.
+ void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+} // namespace ROCKSDB_NAMESPACE