path: root/src/rocksdb/file
Diffstat (limited to 'src/rocksdb/file')
-rw-r--r--  src/rocksdb/file/delete_scheduler.cc  411
-rw-r--r--  src/rocksdb/file/delete_scheduler.h  149
-rw-r--r--  src/rocksdb/file/delete_scheduler_test.cc  724
-rw-r--r--  src/rocksdb/file/file_prefetch_buffer.cc  918
-rw-r--r--  src/rocksdb/file/file_prefetch_buffer.h  446
-rw-r--r--  src/rocksdb/file/file_util.cc  282
-rw-r--r--  src/rocksdb/file/file_util.h  89
-rw-r--r--  src/rocksdb/file/filename.cc  523
-rw-r--r--  src/rocksdb/file/filename.h  188
-rw-r--r--  src/rocksdb/file/line_file_reader.cc  73
-rw-r--r--  src/rocksdb/file/line_file_reader.h  60
-rw-r--r--  src/rocksdb/file/prefetch_test.cc  2109
-rw-r--r--  src/rocksdb/file/random_access_file_reader.cc  602
-rw-r--r--  src/rocksdb/file/random_access_file_reader.h  217
-rw-r--r--  src/rocksdb/file/random_access_file_reader_test.cc  481
-rw-r--r--  src/rocksdb/file/read_write_util.cc  33
-rw-r--r--  src/rocksdb/file/read_write_util.h  31
-rw-r--r--  src/rocksdb/file/readahead_file_info.h  33
-rw-r--r--  src/rocksdb/file/readahead_raf.cc  169
-rw-r--r--  src/rocksdb/file/readahead_raf.h  29
-rw-r--r--  src/rocksdb/file/sequence_file_reader.cc  328
-rw-r--r--  src/rocksdb/file/sequence_file_reader.h  129
-rw-r--r--  src/rocksdb/file/sst_file_manager_impl.cc  525
-rw-r--r--  src/rocksdb/file/sst_file_manager_impl.h  195
-rw-r--r--  src/rocksdb/file/writable_file_writer.cc  1025
-rw-r--r--  src/rocksdb/file/writable_file_writer.h  336
26 files changed, 10105 insertions, 0 deletions
diff --git a/src/rocksdb/file/delete_scheduler.cc b/src/rocksdb/file/delete_scheduler.cc
new file mode 100644
index 000000000..b97a0f224
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler.cc
@@ -0,0 +1,411 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "file/delete_scheduler.h"
+
+#include <cinttypes>
+#include <thread>
+#include <vector>
+
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs,
+ int64_t rate_bytes_per_sec, Logger* info_log,
+ SstFileManagerImpl* sst_file_manager,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk)
+ : clock_(clock),
+ fs_(fs),
+ total_trash_size_(0),
+ rate_bytes_per_sec_(rate_bytes_per_sec),
+ pending_files_(0),
+ bytes_max_delete_chunk_(bytes_max_delete_chunk),
+ closing_(false),
+ cv_(&mu_),
+ bg_thread_(nullptr),
+ info_log_(info_log),
+ sst_file_manager_(sst_file_manager),
+ max_trash_db_ratio_(max_trash_db_ratio) {
+ assert(sst_file_manager != nullptr);
+ assert(max_trash_db_ratio >= 0);
+ MaybeCreateBackgroundThread();
+}
+
+DeleteScheduler::~DeleteScheduler() {
+ {
+ InstrumentedMutexLock l(&mu_);
+ closing_ = true;
+ cv_.SignalAll();
+ }
+ if (bg_thread_) {
+ bg_thread_->join();
+ }
+ for (const auto& it : bg_errors_) {
+ it.second.PermitUncheckedError();
+ }
+}
+
+Status DeleteScheduler::DeleteFile(const std::string& file_path,
+ const std::string& dir_to_sync,
+ const bool force_bg) {
+ if (rate_bytes_per_sec_.load() <= 0 ||
+ (!force_bg &&
+ total_trash_size_.load() >
+ sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) {
+ // Rate limiting is disabled or trash size makes up more than
+ // max_trash_db_ratio_ (default 25%) of the total DB size
+ TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
+ Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+ if (s.ok()) {
+ s = sst_file_manager_->OnDeleteFile(file_path);
+ ROCKS_LOG_INFO(info_log_,
+ "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64
+ ", total_trash_size %" PRIu64 " max_trash_db_ratio %lf",
+ file_path.c_str(), rate_bytes_per_sec_.load(),
+ total_trash_size_.load(), max_trash_db_ratio_.load());
+ InstrumentedMutexLock l(&mu_);
+ RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
+ }
+ return s;
+ }
+
+ // Move file to trash
+ std::string trash_file;
+ Status s = MarkAsTrash(file_path, &trash_file);
+ ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(),
+ s.ToString().c_str());
+
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s",
+ file_path.c_str(), s.ToString().c_str());
+ s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+ if (s.ok()) {
+ s = sst_file_manager_->OnDeleteFile(file_path);
+ ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately",
+ trash_file.c_str());
+ InstrumentedMutexLock l(&mu_);
+ RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
+ }
+ return s;
+ }
+
+ // Update the total trash size
+ uint64_t trash_file_size = 0;
+ IOStatus io_s =
+ fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
+ if (io_s.ok()) {
+ total_trash_size_.fetch_add(trash_file_size);
+ }
+ //**TODO: What should we do if we failed to
+ // get the file size?
+
+ // Add file to delete queue
+ {
+ InstrumentedMutexLock l(&mu_);
+ RecordTick(stats_.get(), FILES_MARKED_TRASH);
+ queue_.emplace(trash_file, dir_to_sync);
+ pending_files_++;
+ if (pending_files_ == 1) {
+ cv_.SignalAll();
+ }
+ }
+ return s;
+}
+
+std::map<std::string, Status> DeleteScheduler::GetBackgroundErrors() {
+ InstrumentedMutexLock l(&mu_);
+ return bg_errors_;
+}
+
+const std::string DeleteScheduler::kTrashExtension = ".trash";
+bool DeleteScheduler::IsTrashFile(const std::string& file_path) {
+ return (file_path.size() >= kTrashExtension.size() &&
+ file_path.rfind(kTrashExtension) ==
+ file_path.size() - kTrashExtension.size());
+}
+
+Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+ const std::string& path) {
+ Status s;
+ // Check if there are any files marked as trash in this path
+ std::vector<std::string> files_in_path;
+ const auto& fs = env->GetFileSystem();
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = fs->GetChildren(path, io_opts, &files_in_path,
+ /*IODebugContext*=*/nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ for (const std::string& current_file : files_in_path) {
+ if (!DeleteScheduler::IsTrashFile(current_file)) {
+ // not a trash file, skip
+ continue;
+ }
+
+ Status file_delete;
+ std::string trash_file = path + "/" + current_file;
+ if (sfm) {
+ // We have an SstFileManager that will schedule the file delete
+ s = sfm->OnAddFile(trash_file);
+ file_delete = sfm->ScheduleFileDeletion(trash_file, path);
+ } else {
+ // Delete the file immediately
+ file_delete = env->DeleteFile(trash_file);
+ }
+
+ if (s.ok() && !file_delete.ok()) {
+ s = file_delete;
+ }
+ }
+
+ return s;
+}
+
+Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
+ std::string* trash_file) {
+ // Sanity check of the path
+ size_t idx = file_path.rfind("/");
+ if (idx == std::string::npos || idx == file_path.size() - 1) {
+ return Status::InvalidArgument("file_path is corrupted");
+ }
+
+ if (DeleteScheduler::IsTrashFile(file_path)) {
+ // This is already a trash file
+ *trash_file = file_path;
+ return Status::OK();
+ }
+
+ *trash_file = file_path + kTrashExtension;
+ // TODO(tec) : Implement Env::RenameFileIfNotExist and remove
+ // file_move_mu mutex.
+ int cnt = 0;
+ Status s;
+ InstrumentedMutexLock l(&file_move_mu_);
+ while (true) {
+ s = fs_->FileExists(*trash_file, IOOptions(), nullptr);
+ if (s.IsNotFound()) {
+ // We found a path for our file in trash
+ s = fs_->RenameFile(file_path, *trash_file, IOOptions(), nullptr);
+ break;
+ } else if (s.ok()) {
+      // Name conflict, append a counter to generate a new candidate name
+ *trash_file = file_path + std::to_string(cnt) + kTrashExtension;
+ } else {
+ // Error during FileExists call, we cannot continue
+ break;
+ }
+ cnt++;
+ }
+ if (s.ok()) {
+ s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
+ }
+ return s;
+}
+
+void DeleteScheduler::BackgroundEmptyTrash() {
+ TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
+
+ while (true) {
+ InstrumentedMutexLock l(&mu_);
+ while (queue_.empty() && !closing_) {
+ cv_.Wait();
+ }
+
+ if (closing_) {
+ return;
+ }
+
+ // Delete all files in queue_
+ uint64_t start_time = clock_->NowMicros();
+ uint64_t total_deleted_bytes = 0;
+ int64_t current_delete_rate = rate_bytes_per_sec_.load();
+ while (!queue_.empty() && !closing_) {
+ if (current_delete_rate != rate_bytes_per_sec_.load()) {
+ // User changed the delete rate
+ current_delete_rate = rate_bytes_per_sec_.load();
+ start_time = clock_->NowMicros();
+ total_deleted_bytes = 0;
+ ROCKS_LOG_INFO(info_log_, "rate_bytes_per_sec is changed to %" PRIi64,
+ current_delete_rate);
+ }
+
+ // Get new file to delete
+ const FileAndDir& fad = queue_.front();
+ std::string path_in_trash = fad.fname;
+
+ // We don't need to hold the lock while deleting the file
+ mu_.Unlock();
+ uint64_t deleted_bytes = 0;
+ bool is_complete = true;
+      // Delete file from trash and update the total_penalty value
+ Status s =
+ DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
+ total_deleted_bytes += deleted_bytes;
+ mu_.Lock();
+ if (is_complete) {
+ queue_.pop();
+ }
+
+ if (!s.ok()) {
+ bg_errors_[path_in_trash] = s;
+ }
+
+ // Apply penalty if necessary
+ uint64_t total_penalty;
+ if (current_delete_rate > 0) {
+ // rate limiting is enabled
+ total_penalty =
+ ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate);
+ ROCKS_LOG_INFO(info_log_,
+ "Rate limiting is enabled with penalty %" PRIu64
+ " after deleting file %s",
+ total_penalty, path_in_trash.c_str());
+ while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) {
+ }
+ } else {
+ // rate limiting is disabled
+ total_penalty = 0;
+ ROCKS_LOG_INFO(info_log_,
+ "Rate limiting is disabled after deleting file %s",
+ path_in_trash.c_str());
+ }
+ TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
+ &total_penalty);
+
+ if (is_complete) {
+ pending_files_--;
+ }
+ if (pending_files_ == 0) {
+ // Unblock WaitForEmptyTrash since there are no more files waiting
+ // to be deleted
+ cv_.SignalAll();
+ }
+ }
+ }
+}
+
+Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
+ const std::string& dir_to_sync,
+ uint64_t* deleted_bytes,
+ bool* is_complete) {
+ uint64_t file_size;
+ Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
+ *is_complete = true;
+ TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
+ if (s.ok()) {
+ bool need_full_delete = true;
+ if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
+ uint64_t num_hard_links = 2;
+      // We don't have to worry about a data race between linking a new
+      // file after the link count check and ftruncate, because the file
+      // is now in trash and RocksDB is not supposed to create hardlinks
+      // to trash files.
+ Status my_status = fs_->NumFileLinks(path_in_trash, IOOptions(),
+ &num_hard_links, nullptr);
+ if (my_status.ok()) {
+ if (num_hard_links == 1) {
+ std::unique_ptr<FSWritableFile> wf;
+ my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(), &wf,
+ nullptr);
+ if (my_status.ok()) {
+ my_status = wf->Truncate(file_size - bytes_max_delete_chunk_,
+ IOOptions(), nullptr);
+ if (my_status.ok()) {
+ TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
+ my_status = wf->Fsync(IOOptions(), nullptr);
+ }
+ }
+ if (my_status.ok()) {
+ *deleted_bytes = bytes_max_delete_chunk_;
+ need_full_delete = false;
+ *is_complete = false;
+ } else {
+ ROCKS_LOG_WARN(info_log_,
+ "Failed to partially delete %s from trash -- %s",
+ path_in_trash.c_str(), my_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_INFO(info_log_,
+ "Cannot delete %s slowly through ftruncate from trash "
+ "as it has other links",
+ path_in_trash.c_str());
+ }
+ } else if (!num_link_error_printed_) {
+ ROCKS_LOG_INFO(
+ info_log_,
+ "Cannot delete files slowly through ftruncate from trash "
+ "as Env::NumFileLinks() returns error: %s",
+ my_status.ToString().c_str());
+ num_link_error_printed_ = true;
+ }
+ }
+
+ if (need_full_delete) {
+ s = fs_->DeleteFile(path_in_trash, IOOptions(), nullptr);
+ if (!dir_to_sync.empty()) {
+ std::unique_ptr<FSDirectory> dir_obj;
+ if (s.ok()) {
+ s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr);
+ }
+ if (s.ok()) {
+ s = dir_obj->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted));
+ TEST_SYNC_POINT_CALLBACK(
+ "DeleteScheduler::DeleteTrashFile::AfterSyncDir",
+ reinterpret_cast<void*>(const_cast<std::string*>(&dir_to_sync)));
+ }
+ }
+ if (s.ok()) {
+ *deleted_bytes = file_size;
+ s = sst_file_manager_->OnDeleteFile(path_in_trash);
+ }
+ }
+ }
+ if (!s.ok()) {
+ // Error while getting file size or while deleting
+ ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s",
+ path_in_trash.c_str(), s.ToString().c_str());
+ *deleted_bytes = 0;
+ } else {
+ total_trash_size_.fetch_sub(*deleted_bytes);
+ }
+
+ return s;
+}
+
+void DeleteScheduler::WaitForEmptyTrash() {
+ InstrumentedMutexLock l(&mu_);
+ while (pending_files_ > 0 && !closing_) {
+ cv_.Wait();
+ }
+}
+
+void DeleteScheduler::MaybeCreateBackgroundThread() {
+ if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) {
+ bg_thread_.reset(
+ new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this));
+ ROCKS_LOG_INFO(info_log_,
+ "Created background thread for deletion scheduler with "
+ "rate_bytes_per_sec: %" PRIi64,
+ rate_bytes_per_sec_.load());
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
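The rate limiting above boils down to the penalty computation in DeleteScheduler::BackgroundEmptyTrash: after each deletion the background thread waits until start_time + (total_deleted_bytes * kMicrosInSecond) / rate_bytes_per_sec. The stand-alone sketch below only illustrates that arithmetic; the helper name TrashPenaltyMicros and the sample numbers are invented for illustration and are not part of this change.

#include <cstdint>

// Illustrative helper mirroring the penalty formula used by
// DeleteScheduler::BackgroundEmptyTrash (hypothetical, not a RocksDB API).
uint64_t TrashPenaltyMicros(uint64_t total_deleted_bytes,
                            int64_t rate_bytes_per_sec) {
  constexpr uint64_t kMicrosInSecond = 1000 * 1000LL;
  if (rate_bytes_per_sec <= 0) {
    return 0;  // rate limiting disabled: no sleep penalty
  }
  return (total_deleted_bytes * kMicrosInSecond) /
         static_cast<uint64_t>(rate_bytes_per_sec);
}

// Example: after deleting three 1 KB trash files at 1024 bytes/sec, the
// background thread waits until
// start_time + TrashPenaltyMicros(3 * 1024, 1024) = start_time + 3,000,000 us.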
diff --git a/src/rocksdb/file/delete_scheduler.h b/src/rocksdb/file/delete_scheduler.h
new file mode 100644
index 000000000..2904ec621
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <queue>
+#include <string>
+#include <thread>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class FileSystem;
+class Logger;
+class SstFileManagerImpl;
+class SystemClock;
+
+// DeleteScheduler allows the DB to enforce a rate limit on file deletion.
+// Instead of deleting files immediately, files are marked as trash and
+// deleted in a background thread that applies a sleep penalty between
+// deletes if they are happening at a rate faster than rate_bytes_per_sec.
+//
+// Rate limiting can be turned off by setting rate_bytes_per_sec = 0; in this
+// case DeleteScheduler will delete files immediately.
+class DeleteScheduler {
+ public:
+ DeleteScheduler(SystemClock* clock, FileSystem* fs,
+ int64_t rate_bytes_per_sec, Logger* info_log,
+ SstFileManagerImpl* sst_file_manager,
+ double max_trash_db_ratio, uint64_t bytes_max_delete_chunk);
+
+ ~DeleteScheduler();
+
+ // Return delete rate limit in bytes per second
+ int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_.load(); }
+
+ // Set delete rate limit in bytes per second
+ void SetRateBytesPerSecond(int64_t bytes_per_sec) {
+ rate_bytes_per_sec_.store(bytes_per_sec);
+ MaybeCreateBackgroundThread();
+ }
+
+  // Mark the file as trash and schedule its deletion. If force_bg is set,
+  // the file is always deleted in the background thread, except when rate
+  // limiting is disabled.
+ Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
+ const bool force_bg = false);
+
+  // Wait for all files being deleted in the background to finish, or for
+  // the destructor to be called.
+ void WaitForEmptyTrash();
+
+ // Return a map containing errors that happened in BackgroundEmptyTrash
+ // file_path => error status
+ std::map<std::string, Status> GetBackgroundErrors();
+
+ uint64_t GetTotalTrashSize() { return total_trash_size_.load(); }
+
+ // Return trash/DB size ratio where new files will be deleted immediately
+ double GetMaxTrashDBRatio() { return max_trash_db_ratio_.load(); }
+
+ // Update trash/DB size ratio where new files will be deleted immediately
+ void SetMaxTrashDBRatio(double r) {
+ assert(r >= 0);
+ max_trash_db_ratio_.store(r);
+ }
+
+ static const std::string kTrashExtension;
+ static bool IsTrashFile(const std::string& file_path);
+
+  // Check if there are any .trash files in path and schedule their deletion,
+  // or delete them immediately if sst_file_manager is nullptr
+ static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+ const std::string& path);
+
+ void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) {
+ InstrumentedMutexLock l(&mu_);
+ stats_ = stats;
+ }
+
+ private:
+ Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
+
+ Status DeleteTrashFile(const std::string& path_in_trash,
+ const std::string& dir_to_sync,
+ uint64_t* deleted_bytes, bool* is_complete);
+
+ void BackgroundEmptyTrash();
+
+ void MaybeCreateBackgroundThread();
+
+ SystemClock* clock_;
+ FileSystem* fs_;
+
+ // total size of trash files
+ std::atomic<uint64_t> total_trash_size_;
+ // Maximum number of bytes that should be deleted per second
+ std::atomic<int64_t> rate_bytes_per_sec_;
+ // Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_
+ InstrumentedMutex mu_;
+
+ struct FileAndDir {
+ FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
+ std::string fname;
+ std::string dir; // empty will be skipped.
+ };
+
+ // Queue of trash files that need to be deleted
+ std::queue<FileAndDir> queue_;
+ // Number of trash files that are waiting to be deleted
+ int32_t pending_files_;
+ uint64_t bytes_max_delete_chunk_;
+ // Errors that happened in BackgroundEmptyTrash (file_path => error)
+ std::map<std::string, Status> bg_errors_;
+
+ bool num_link_error_printed_ = false;
+ // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
+ bool closing_;
+ // Condition variable signaled in these conditions
+ // - pending_files_ value change from 0 => 1
+ // - pending_files_ value change from 1 => 0
+ // - closing_ value is set to true
+ InstrumentedCondVar cv_;
+ // Background thread running BackgroundEmptyTrash
+ std::unique_ptr<port::Thread> bg_thread_;
+ // Mutex to protect threads from file name conflicts
+ InstrumentedMutex file_move_mu_;
+ Logger* info_log_;
+ SstFileManagerImpl* sst_file_manager_;
+  // If the trash size constitutes more than this fraction of the total DB
+  // size, we will start deleting new files passed to DeleteScheduler
+  // immediately
+ std::atomic<double> max_trash_db_ratio_;
+ static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+ std::shared_ptr<Statistics> stats_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
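As a bridge between the header above and the tests below, here is a minimal usage sketch modeled on the test fixture in delete_scheduler_test.cc: an SstFileManagerImpl owns the DeleteScheduler, DeleteFile() moves a file to trash, and WaitForEmptyTrash() blocks until the background thread drains the queue. Both classes are RocksDB-internal and the file path used here is hypothetical, so treat this as an illustrative sketch rather than supported API usage.

#include <map>
#include <string>

#include "file/delete_scheduler.h"
#include "file/sst_file_manager_impl.h"
#include "rocksdb/env.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Env* env = Env::Default();
  // 1 MB/s trash deletion rate; max_trash_db_ratio > 1 so files are not
  // deleted immediately; partial deletes happen in 128 KB chunks.
  SstFileManagerImpl sfm(env->GetSystemClock(), env->GetFileSystem(),
                         /*logger=*/nullptr, /*rate_bytes_per_sec=*/1 << 20,
                         /*max_trash_db_ratio=*/1.1,
                         /*bytes_max_delete_chunk=*/128 * 1024);
  DeleteScheduler* scheduler = sfm.delete_scheduler();

  // Hypothetical pre-existing file tracked by the SstFileManager.
  const std::string file = "/tmp/example.sst";
  sfm.OnAddFile(file).PermitUncheckedError();

  // The file is renamed to "<name>.trash" and removed by the background
  // thread, throttled by rate_bytes_per_sec.
  Status s = scheduler->DeleteFile(file, /*dir_to_sync=*/"");
  if (s.ok()) {
    scheduler->WaitForEmptyTrash();
  }
  // Errors hit by the background thread are reported per file path.
  std::map<std::string, Status> errors = scheduler->GetBackgroundErrors();
  return (s.ok() && errors.empty()) ? 0 : 1;
}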
diff --git a/src/rocksdb/file/delete_scheduler_test.cc b/src/rocksdb/file/delete_scheduler_test.cc
new file mode 100644
index 000000000..d825da32a
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler_test.cc
@@ -0,0 +1,724 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "file/delete_scheduler.h"
+
+#include <atomic>
+#include <cinttypes>
+#include <thread>
+#include <vector>
+
+#include "file/file_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteSchedulerTest : public testing::Test {
+ public:
+ DeleteSchedulerTest() : env_(Env::Default()) {
+ const int kNumDataDirs = 3;
+ dummy_files_dirs_.reserve(kNumDataDirs);
+ for (size_t i = 0; i < kNumDataDirs; ++i) {
+ dummy_files_dirs_.emplace_back(
+ test::PerThreadDBPath(env_, "delete_scheduler_dummy_data_dir") +
+ std::to_string(i));
+ DestroyAndCreateDir(dummy_files_dirs_.back());
+ }
+ stats_ = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ }
+
+ ~DeleteSchedulerTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ for (const auto& dummy_files_dir : dummy_files_dirs_) {
+ DestroyDir(env_, dummy_files_dir);
+ }
+ }
+
+ void DestroyAndCreateDir(const std::string& dir) {
+ ASSERT_OK(DestroyDir(env_, dir));
+ EXPECT_OK(env_->CreateDir(dir));
+ }
+
+ int CountNormalFiles(size_t dummy_files_dirs_idx = 0) {
+ std::vector<std::string> files_in_dir;
+ EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+ &files_in_dir));
+
+ int normal_cnt = 0;
+ for (auto& f : files_in_dir) {
+ if (!DeleteScheduler::IsTrashFile(f)) {
+ normal_cnt++;
+ }
+ }
+ return normal_cnt;
+ }
+
+ int CountTrashFiles(size_t dummy_files_dirs_idx = 0) {
+ std::vector<std::string> files_in_dir;
+ EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+ &files_in_dir));
+
+ int trash_cnt = 0;
+ for (auto& f : files_in_dir) {
+ if (DeleteScheduler::IsTrashFile(f)) {
+ trash_cnt++;
+ }
+ }
+ return trash_cnt;
+ }
+
+ std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
+ size_t dummy_files_dirs_idx = 0) {
+ std::string file_path =
+ dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
+ std::unique_ptr<WritableFile> f;
+ env_->NewWritableFile(file_path, &f, EnvOptions());
+ std::string data(size, 'A');
+ EXPECT_OK(f->Append(data));
+ EXPECT_OK(f->Close());
+ sst_file_mgr_->OnAddFile(file_path);
+ return file_path;
+ }
+
+ void NewDeleteScheduler() {
+    // Tests in this file are for the DeleteScheduler component and don't
+    // create any DBs, so we need to set max_trash_db_ratio above 100%
+    // (instead of the default 25%)
+ sst_file_mgr_.reset(
+ new SstFileManagerImpl(env_->GetSystemClock(), env_->GetFileSystem(),
+ nullptr, rate_bytes_per_sec_,
+ /* max_trash_db_ratio= */ 1.1, 128 * 1024));
+ delete_scheduler_ = sst_file_mgr_->delete_scheduler();
+ sst_file_mgr_->SetStatisticsPtr(stats_);
+ }
+
+ Env* env_;
+ std::vector<std::string> dummy_files_dirs_;
+ int64_t rate_bytes_per_sec_;
+ DeleteScheduler* delete_scheduler_;
+ std::unique_ptr<SstFileManagerImpl> sst_file_mgr_;
+ std::shared_ptr<Statistics> stats_;
+};
+
+// Test the basic functionality of DeleteScheduler (Rate Limiting).
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in trash
+// 4- Verify that BackgroundEmptyTrash used the correct penalties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::BasicRateLimiting:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+ int dir_synced = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile::AfterSyncDir", [&](void* arg) {
+ dir_synced++;
+ std::string* dir = reinterpret_cast<std::string*>(arg);
+ EXPECT_EQ(dummy_files_dirs_[0], *dir);
+ });
+
+ int num_files = 100; // 100 files
+ uint64_t file_size = 1024; // every file is 1 kb
+ std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ NewDeleteScheduler();
+
+ dir_synced = 0;
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + std::to_string(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files and measure time spent to empty trash
+ for (int i = 0; i < num_files; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i],
+ dummy_files_dirs_[0]));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::BasicRateLimiting:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penlty = 0;
+ ASSERT_EQ(penalties.size(), num_files);
+ for (int i = 0; i < num_files; i++) {
+ total_files_size += file_size;
+ expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penlty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+
+ ASSERT_EQ(num_files, dir_synced);
+
+ ASSERT_EQ(CountTrashFiles(), 0);
+ ASSERT_EQ(num_files, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DeleteSchedulerTest, MultiDirectoryDeletionsScheduled) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ rate_bytes_per_sec_ = 1 << 20; // 1MB
+ NewDeleteScheduler();
+
+ // Generate dummy files in multiple directories
+ const size_t kNumFiles = dummy_files_dirs_.size();
+ const size_t kFileSize = 1 << 10; // 1KB
+ std::vector<std::string> generated_files;
+ for (size_t i = 0; i < kNumFiles; i++) {
+ generated_files.push_back(NewDummyFile("file", kFileSize, i));
+ ASSERT_EQ(1, CountNormalFiles(i));
+ }
+
+ // Mark dummy files as trash
+ for (size_t i = 0; i < kNumFiles; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+ ASSERT_EQ(0, CountNormalFiles(i));
+ ASSERT_EQ(1, CountTrashFiles(i));
+ }
+ TEST_SYNC_POINT("DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1");
+ delete_scheduler_->WaitForEmptyTrash();
+
+ // Verify dummy files eventually got deleted
+ for (size_t i = 0; i < kNumFiles; i++) {
+ ASSERT_EQ(0, CountNormalFiles(i));
+ ASSERT_EQ(0, CountTrashFiles(i));
+ }
+
+ ASSERT_EQ(kNumFiles, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Same as the BasicRateLimiting test but delete files in multiple threads.
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler using 10 threads
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Verify that BackgroundEmptyTrash used the correct penalties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::RateLimitingMultiThreaded:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ int thread_cnt = 10;
+ int num_files = 10; // 10 files per thread
+ uint64_t file_size = 1024; // every file is 1 kb
+
+ std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ NewDeleteScheduler();
+
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files * thread_cnt; i++) {
+ std::string file_name = "file" + std::to_string(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files using 10 threads and measure time spent to empty trash
+ std::atomic<int> thread_num(0);
+ std::vector<port::Thread> threads;
+ std::function<void()> delete_thread = [&]() {
+ int idx = thread_num.fetch_add(1);
+ int range_start = idx * num_files;
+ int range_end = range_start + num_files;
+ for (int j = range_start; j < range_end; j++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[j], ""));
+ }
+ };
+
+ for (int i = 0; i < thread_cnt; i++) {
+ threads.emplace_back(delete_thread);
+ }
+
+ for (size_t i = 0; i < threads.size(); i++) {
+ threads[i].join();
+ }
+
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::RateLimitingMultiThreaded:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penlty = 0;
+ ASSERT_EQ(penalties.size(), num_files * thread_cnt);
+ for (int i = 0; i < num_files * thread_cnt; i++) {
+ total_files_size += file_size;
+ expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penlty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 0);
+ ASSERT_EQ(num_files * thread_cnt,
+ stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+// Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure
+// that when DeleteScheduler deletes a file it deletes it immediately and
+// doesn't move it to trash
+TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 0;
+ NewDeleteScheduler();
+ constexpr int num_files = 10;
+
+ for (int i = 0; i < num_files; i++) {
+ // Every file we delete will be deleted immediately
+ std::string dummy_file = NewDummyFile("dummy.data");
+ ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+ ASSERT_TRUE(env_->FileExists(dummy_file).IsNotFound());
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 0);
+ }
+
+ ASSERT_EQ(bg_delete_file, 0);
+ ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(num_files,
+ stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Testing that moving files to trash with the same name is not a problem
+// 1- Create 10 files with the same name "conflict.data"
+// 2- Delete the 10 files using DeleteScheduler
+// 3- Make sure that the trash directory contains 10 files ("conflict.data" x 10)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that files are deleted from trash
+TEST_F(DeleteSchedulerTest, ConflictNames) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::ConflictNames:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
+ NewDeleteScheduler();
+
+ // Create "conflict.data" and move it to trash 10 times
+ for (int i = 0; i < 10; i++) {
+ std::string dummy_file = NewDummyFile("conflict.data");
+ ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ // 10 files ("conflict.data" x 10) in trash
+ ASSERT_EQ(CountTrashFiles(), 10);
+
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DeleteSchedulerTest::ConflictNames:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ ASSERT_EQ(CountTrashFiles(), 0);
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(10, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete the 10 files using DeleteScheduler (move them to trash)
+// 3- Delete the 10 files directly (using env_->DeleteFile)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that DeleteScheduler failed to delete the 10 files and
+// reported 10 background errors
+TEST_F(DeleteSchedulerTest, BackgroundError) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::BackgroundError:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
+ NewDeleteScheduler();
+
+ // Generate 10 dummy files and move them to trash
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + std::to_string(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 10);
+
+  // Delete 10 files from trash; this will cause background errors in
+  // BackgroundEmptyTrash since we already deleted the files it was
+  // going to delete
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + std::to_string(i) + ".data.trash";
+ ASSERT_OK(env_->DeleteFile(dummy_files_dirs_[0] + "/" + file_name));
+ }
+
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DeleteSchedulerTest::BackgroundError:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 10);
+ for (const auto& it : bg_errors) {
+ ASSERT_TRUE(it.second.IsPathNotFound());
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create kTestFileNum dummy files
+// 2- Delete kTestFileNum dummy files using DeleteScheduler
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Make sure all files in trash directory were deleted
+// 5- Repeat previous steps 5 times
+TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
+ constexpr int kTestFileNum = 10;
+ std::atomic_int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+  // If trash files are generated faster than they are deleted, the scheduler
+  // will delete them directly instead of waiting for the background
+  // trash-emptying thread to clean them. Set the ratio higher to avoid that.
+ sst_file_mgr_->SetMaxTrashDBRatio(kTestFileNum + 1);
+
+ // Move files to trash, wait for empty trash, start again
+ for (int run = 1; run <= 5; run++) {
+ // Generate kTestFileNum dummy files and move them to trash
+ for (int i = 0; i < kTestFileNum; i++) {
+ std::string file_name = "data_" + std::to_string(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ delete_scheduler_->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, kTestFileNum * run);
+ ASSERT_EQ(CountTrashFiles(), 0);
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(kTestFileNum, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+ }
+
+ ASSERT_EQ(bg_delete_file, 5 * kTestFileNum);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DeletePartialFile) {
+ int bg_delete_file = 0;
+ int bg_fsync = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void*) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+  // Should be deleted in 4 batches
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_1", 500 * 1024), ""));
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_2", 100 * 1024), ""));
+  // Should be deleted in 2 batches
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_2", 200 * 1024), ""));
+
+ delete_scheduler_->WaitForEmptyTrash();
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
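+  // With bytes_max_delete_chunk = 128 KB: data_1 (500 KB) takes 3 truncations
+  // plus 1 final delete, the first data_2 (100 KB) is deleted in one step, and
+  // the second data_2 (200 KB) takes 1 truncation plus 1 final delete, giving
+  // 7 DeleteTrashFile calls and 4 fsyncs.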
+ ASSERT_EQ(7, bg_delete_file);
+ ASSERT_EQ(4, bg_fsync);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifdef OS_LINUX
+TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
+ int bg_delete_file = 0;
+ int bg_fsync = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void*) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ std::string file1 = NewDummyFile("data_1", 500 * 1024);
+ std::string file2 = NewDummyFile("data_2", 100 * 1024);
+
+ ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+ ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+  // Would be deleted in 4 batches if there were no hardlinks
+ ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
+ ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
+
+ delete_scheduler_->WaitForEmptyTrash();
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(2, bg_delete_file);
+ ASSERT_EQ(0, bg_fsync);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
+// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
+// 2- Delete 100 files using DeleteScheduler
+// 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
+// 4- Make sure that not all files were deleted from trash and that
+// DeleteScheduler background thread did not delete all files
+TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1; // 1 Byte / sec
+ NewDeleteScheduler();
+
+ for (int i = 0; i < 100; i++) {
+ std::string file_name = "data_" + std::to_string(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+
+  // Deleting 100 files at 1 byte/sec would need >28 hours, so we destroy
+  // the DeleteScheduler while the delete queue is not empty
+ sst_file_mgr_.reset();
+
+ ASSERT_LT(bg_delete_file, 100);
+ ASSERT_GT(CountTrashFiles(), 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DISABLED_DynamicRateLimiting1) {
+ std::vector<uint64_t> penalties;
+ int bg_delete_file = 0;
+ int fg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::DynamicRateLimiting1:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 0; // Disable rate limiting initially
+ NewDeleteScheduler();
+
+ int num_files = 10; // 10 files
+ uint64_t file_size = 1024; // every file is 1 kb
+
+ std::vector<int64_t> delete_kbs_per_sec = {512, 200, 0, 100, 50, -2, 25};
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ bg_delete_file = 0;
+ fg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ delete_scheduler_->SetRateBytesPerSecond(rate_bytes_per_sec_);
+
+    // Create num_files dummy files, every file is 1 KB
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + std::to_string(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files and measure time spent to empty trash
+ for (int i = 0; i < num_files; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+
+ if (rate_bytes_per_sec_ > 0) {
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::DynamicRateLimiting1:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penlty = 0;
+ ASSERT_EQ(penalties.size(), num_files);
+ for (int i = 0; i < num_files; i++) {
+ total_files_size += file_size;
+ expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penlty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+ ASSERT_EQ(bg_delete_file, num_files);
+ ASSERT_EQ(fg_delete_file, 0);
+ } else {
+ ASSERT_EQ(penalties.size(), 0);
+ ASSERT_EQ(bg_delete_file, 0);
+ ASSERT_EQ(fg_delete_file, num_files);
+ }
+
+ ASSERT_EQ(CountTrashFiles(), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) {
+ int bg_delete_file = 0;
+ int fg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_files = 100; // 100 files
+  uint64_t file_size = 1024 * 10; // 10 KB as a file size
+ rate_bytes_per_sec_ = 1; // 1 byte per sec (very slow trash delete)
+
+ NewDeleteScheduler();
+ delete_scheduler_->SetMaxTrashDBRatio(0.25);
+
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + std::to_string(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ for (std::string& file_name : generated_files) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(file_name, ""));
+ }
+
+ // When we end up with 26 files in trash we will start
+ // deleting new files immediately
+ ASSERT_EQ(fg_delete_file, 74);
+ ASSERT_EQ(26, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(74, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, IsTrashCheck) {
+ // Trash files
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("x.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile(".trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.sst.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("/a/b/c/abc..sst.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("log.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("^^^^^.log.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.t.trash"));
+
+ // Not trash files
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.sst"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.txt"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sst"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sstrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("^^^^^.trashh"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.ttrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile(".ttrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+int main(int /*argc*/, char** /*argv*/) {
+ printf("DeleteScheduler is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+#endif // ROCKSDB_LITE
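The ConflictNames test above exercises the candidate-name loop in DeleteScheduler::MarkAsTrash shown earlier in this diff: the first unused name among "<name>.trash", "<name>0.trash", "<name>1.trash", ... is picked. Below is a stand-alone sketch of that loop; PickTrashName is a hypothetical helper and std::filesystem stands in for the internal FileSystem API purely for illustration.

#include <filesystem>
#include <string>

// Sketch of the trash-name selection in DeleteScheduler::MarkAsTrash.
// The real code also renames the file under a mutex and reports the
// move to the SstFileManager.
std::string PickTrashName(const std::string& file_path) {
  const std::string kTrashExtension = ".trash";
  std::string candidate = file_path + kTrashExtension;
  int cnt = 0;
  // Append an increasing counter until the name is unused, which is how
  // "conflict.data" becomes "conflict.data.trash", "conflict.data0.trash", ...
  while (std::filesystem::exists(candidate)) {
    candidate = file_path + std::to_string(cnt) + kTrashExtension;
    ++cnt;
  }
  return candidate;
}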
diff --git a/src/rocksdb/file/file_prefetch_buffer.cc b/src/rocksdb/file/file_prefetch_buffer.cc
new file mode 100644
index 000000000..4ac0d0504
--- /dev/null
+++ b/src/rocksdb/file/file_prefetch_buffer.cc
@@ -0,0 +1,918 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/file_prefetch_buffer.h"
+
+#include <algorithm>
+#include <cassert>
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
+ uint64_t offset,
+ size_t roundup_len,
+ uint32_t index, bool refit_tail,
+ uint64_t& chunk_len) {
+ uint64_t chunk_offset_in_buffer = 0;
+ bool copy_data_to_new_buffer = false;
+ // Check if requested bytes are in the existing buffer_.
+ // If only a few bytes exist -- reuse them & read only what is really needed.
+ // This is typically the case of incremental reading of data.
+ // If no bytes exist in buffer -- full pread.
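+  //
+  // Illustrative example (hypothetical numbers): with alignment = 4096 and a
+  // buffer covering file range [8192, 16384), a request at offset = 14000
+  // gives chunk_offset_in_buffer = Rounddown(14000 - 8192, 4096) = 4096 and
+  // chunk_len = 8192 - 4096 = 4096, so only the last 4096 buffered bytes are
+  // kept and the rest of roundup_len is read.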
+ if (DoesBufferContainData(index) && IsOffsetInBuffer(offset, index)) {
+    // Only a few requested bytes are in the buffer. memmove that chunk of
+    // bytes to the beginning, and memcpy them back into the new buffer if a
+    // new buffer is created.
+ chunk_offset_in_buffer = Rounddown(
+ static_cast<size_t>(offset - bufs_[index].offset_), alignment);
+ chunk_len = static_cast<uint64_t>(bufs_[index].buffer_.CurrentSize()) -
+ chunk_offset_in_buffer;
+ assert(chunk_offset_in_buffer % alignment == 0);
+ assert(chunk_len % alignment == 0);
+ assert(chunk_offset_in_buffer + chunk_len <=
+ bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
+ if (chunk_len > 0) {
+ copy_data_to_new_buffer = true;
+ } else {
+ // this reset is not necessary, but just to be safe.
+ chunk_offset_in_buffer = 0;
+ }
+ }
+
+  // Create a new buffer only if current capacity is not sufficient, and memcpy
+  // bytes from old buffer if needed (i.e., if chunk_len is greater than 0).
+ if (bufs_[index].buffer_.Capacity() < roundup_len) {
+ bufs_[index].buffer_.Alignment(alignment);
+ bufs_[index].buffer_.AllocateNewBuffer(
+ static_cast<size_t>(roundup_len), copy_data_to_new_buffer,
+ chunk_offset_in_buffer, static_cast<size_t>(chunk_len));
+ } else if (chunk_len > 0 && refit_tail) {
+ // New buffer not needed. But memmove bytes from tail to the beginning since
+ // chunk_len is greater than 0.
+ bufs_[index].buffer_.RefitTail(static_cast<size_t>(chunk_offset_in_buffer),
+ static_cast<size_t>(chunk_len));
+ } else if (chunk_len > 0) {
+    // For async prefetching, RefitTail is not called with chunk_len > 0.
+    // Allocate a new buffer if needed because the aligned buffer calculates
+    // the remaining space as capacity_ - cursize_, which might not be the
+    // case here as we are not refitting.
+ // TODO akanksha: Update the condition when asynchronous prefetching is
+ // stable.
+ bufs_[index].buffer_.Alignment(alignment);
+ bufs_[index].buffer_.AllocateNewBuffer(
+ static_cast<size_t>(roundup_len), copy_data_to_new_buffer,
+ chunk_offset_in_buffer, static_cast<size_t>(chunk_len));
+ }
+}
+
+Status FilePrefetchBuffer::Read(const IOOptions& opts,
+ RandomAccessFileReader* reader,
+ Env::IOPriority rate_limiter_priority,
+ uint64_t read_len, uint64_t chunk_len,
+ uint64_t rounddown_start, uint32_t index) {
+ Slice result;
+ Status s = reader->Read(opts, rounddown_start + chunk_len, read_len, &result,
+ bufs_[index].buffer_.BufferStart() + chunk_len,
+ /*aligned_buf=*/nullptr, rate_limiter_priority);
+#ifndef NDEBUG
+ if (result.size() < read_len) {
+ // Fake an IO error to force db_stress fault injection to ignore
+ // truncated read errors
+ IGNORE_STATUS_IF_ERROR(Status::IOError());
+ }
+#endif
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Update the buffer offset and size.
+ bufs_[index].offset_ = rounddown_start;
+ bufs_[index].buffer_.Size(static_cast<size_t>(chunk_len) + result.size());
+ return s;
+}
+
+Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts,
+ RandomAccessFileReader* reader,
+ uint64_t read_len,
+ uint64_t rounddown_start, uint32_t index) {
+ // callback for async read request.
+ auto fp = std::bind(&FilePrefetchBuffer::PrefetchAsyncCallback, this,
+ std::placeholders::_1, std::placeholders::_2);
+ FSReadRequest req;
+ Slice result;
+ req.len = read_len;
+ req.offset = rounddown_start;
+ req.result = result;
+ req.scratch = bufs_[index].buffer_.BufferStart();
+ bufs_[index].async_req_len_ = req.len;
+
+ Status s =
+ reader->ReadAsync(req, opts, fp, &(bufs_[index].pos_),
+ &(bufs_[index].io_handle_), &(bufs_[index].del_fn_),
+ /*aligned_buf=*/nullptr);
+ req.status.PermitUncheckedError();
+ if (s.ok()) {
+ bufs_[index].async_read_in_progress_ = true;
+ }
+ return s;
+}
+
+Status FilePrefetchBuffer::Prefetch(const IOOptions& opts,
+ RandomAccessFileReader* reader,
+ uint64_t offset, size_t n,
+ Env::IOPriority rate_limiter_priority) {
+ if (!enable_ || reader == nullptr) {
+ return Status::OK();
+ }
+ TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start");
+
+ if (offset + n <= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
+ // All requested bytes are already in the curr_ buffer. So no need to Read
+ // again.
+ return Status::OK();
+ }
+
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
+ size_t offset_ = static_cast<size_t>(offset);
+ uint64_t rounddown_offset = Rounddown(offset_, alignment);
+ uint64_t roundup_end = Roundup(offset_ + n, alignment);
+ uint64_t roundup_len = roundup_end - rounddown_offset;
+ assert(roundup_len >= alignment);
+ assert(roundup_len % alignment == 0);
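+  // Illustrative example (hypothetical numbers): offset = 5000, n = 8192,
+  // alignment = 4096 -> rounddown_offset = 4096, roundup_end =
+  // Roundup(5000 + 8192, 4096) = 16384, roundup_len = 12288.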
+
+ uint64_t chunk_len = 0;
+ CalculateOffsetAndLen(alignment, offset, roundup_len, curr_,
+ true /*refit_tail*/, chunk_len);
+ size_t read_len = static_cast<size_t>(roundup_len - chunk_len);
+
+ Status s = Read(opts, reader, rate_limiter_priority, read_len, chunk_len,
+ rounddown_offset, curr_);
+ return s;
+}
+
+// Copy data from src to third buffer.
+void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset,
+ size_t& length) {
+ if (length == 0) {
+ return;
+ }
+ uint64_t copy_offset = (offset - bufs_[src].offset_);
+ size_t copy_len = 0;
+ if (IsDataBlockInBuffer(offset, length, src)) {
+ // All the bytes are in src.
+ copy_len = length;
+ } else {
+ copy_len = bufs_[src].buffer_.CurrentSize() - copy_offset;
+ }
+
+ memcpy(bufs_[2].buffer_.BufferStart() + bufs_[2].buffer_.CurrentSize(),
+ bufs_[src].buffer_.BufferStart() + copy_offset, copy_len);
+
+ bufs_[2].buffer_.Size(bufs_[2].buffer_.CurrentSize() + copy_len);
+
+ // Update offset and length.
+ offset += copy_len;
+ length -= copy_len;
+
+  // length > 0 indicates it has consumed all data from the src buffer and it
+  // still needs to read more from the other buffer.
+ if (length > 0) {
+ bufs_[src].buffer_.Clear();
+ }
+}
+
+// Clear the buffers if they contain outdated data. Data can be outdated
+// because previous sequential reads were served from the cache instead of
+// these buffers. In that case outdated IOs should be aborted.
+void FilePrefetchBuffer::AbortIOIfNeeded(uint64_t offset) {
+ uint32_t second = curr_ ^ 1;
+ std::vector<void*> handles;
+ autovector<uint32_t> buf_pos;
+ if (IsBufferOutdatedWithAsyncProgress(offset, curr_)) {
+ handles.emplace_back(bufs_[curr_].io_handle_);
+ buf_pos.emplace_back(curr_);
+ }
+ if (IsBufferOutdatedWithAsyncProgress(offset, second)) {
+ handles.emplace_back(bufs_[second].io_handle_);
+ buf_pos.emplace_back(second);
+ }
+ if (!handles.empty()) {
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+ Status s = fs_->AbortIO(handles);
+ assert(s.ok());
+ }
+
+ for (auto& pos : buf_pos) {
+ // Release io_handle.
+ DestroyAndClearIOHandle(pos);
+ }
+
+ if (bufs_[second].io_handle_ == nullptr) {
+ bufs_[second].async_read_in_progress_ = false;
+ }
+
+ if (bufs_[curr_].io_handle_ == nullptr) {
+ bufs_[curr_].async_read_in_progress_ = false;
+ }
+}
+
+void FilePrefetchBuffer::AbortAllIOs() {
+ uint32_t second = curr_ ^ 1;
+ std::vector<void*> handles;
+ for (uint32_t i = 0; i < 2; i++) {
+ if (bufs_[i].async_read_in_progress_ && bufs_[i].io_handle_ != nullptr) {
+ handles.emplace_back(bufs_[i].io_handle_);
+ }
+ }
+ if (!handles.empty()) {
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+ Status s = fs_->AbortIO(handles);
+ assert(s.ok());
+ }
+
+ // Release io_handles.
+ if (bufs_[curr_].io_handle_ != nullptr && bufs_[curr_].del_fn_ != nullptr) {
+ DestroyAndClearIOHandle(curr_);
+ } else {
+ bufs_[curr_].async_read_in_progress_ = false;
+ }
+
+ if (bufs_[second].io_handle_ != nullptr && bufs_[second].del_fn_ != nullptr) {
+ DestroyAndClearIOHandle(second);
+ } else {
+ bufs_[second].async_read_in_progress_ = false;
+ }
+}
+
+// Clear the buffers if they contain outdated data. Data can be outdated
+// because previous sequential reads were served from the cache instead of
+// these buffers.
+void FilePrefetchBuffer::UpdateBuffersIfNeeded(uint64_t offset) {
+ uint32_t second = curr_ ^ 1;
+ if (IsBufferOutdated(offset, curr_)) {
+ bufs_[curr_].buffer_.Clear();
+ }
+ if (IsBufferOutdated(offset, second)) {
+ bufs_[second].buffer_.Clear();
+ }
+
+ {
+ // In case buffers do not align, reset second buffer. This can happen in
+ // case readahead_size is set.
+ if (!bufs_[second].async_read_in_progress_ &&
+ !bufs_[curr_].async_read_in_progress_) {
+ if (DoesBufferContainData(curr_)) {
+ if (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() !=
+ bufs_[second].offset_) {
+ bufs_[second].buffer_.Clear();
+ }
+ } else {
+ if (!IsOffsetInBuffer(offset, second)) {
+ bufs_[second].buffer_.Clear();
+ }
+ }
+ }
+ }
+
+  // If data starts from second buffer, make it curr_. Second buffer can be
+  // either partially filled or full, or an async read can be in progress.
+ if (bufs_[second].async_read_in_progress_) {
+ if (IsOffsetInBufferWithAsyncProgress(offset, second)) {
+ curr_ = curr_ ^ 1;
+ }
+ } else {
+ if (DoesBufferContainData(second) && IsOffsetInBuffer(offset, second)) {
+ assert(bufs_[curr_].async_read_in_progress_ ||
+ bufs_[curr_].buffer_.CurrentSize() == 0);
+ curr_ = curr_ ^ 1;
+ }
+ }
+}
+
+void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
+ if (bufs_[curr_].async_read_in_progress_ && fs_ != nullptr) {
+ if (bufs_[curr_].io_handle_ != nullptr) {
+ // Wait for the prefetched data to complete.
+ // No mutex is needed as async_read_in_progress_ acts as a mutex and is
+ // updated only by the main thread.
+ std::vector<void*> handles;
+ handles.emplace_back(bufs_[curr_].io_handle_);
+ StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
+ fs_->Poll(handles, 1).PermitUncheckedError();
+ }
+
+ // Reset and release the io_handle after the Poll API call, as the request
+ // has completed.
+ DestroyAndClearIOHandle(curr_);
+ }
+ UpdateBuffersIfNeeded(offset);
+}
+
+Status FilePrefetchBuffer::HandleOverlappingData(
+ const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
+ size_t length, size_t readahead_size,
+ Env::IOPriority /*rate_limiter_priority*/, bool& copy_to_third_buffer,
+ uint64_t& tmp_offset, size_t& tmp_length) {
+ Status s;
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
+ uint32_t second;
+
+ // Check if the first buffer has the required offset and the async read is
+ // still in progress. This should only happen if a prefetch was initiated
+ // by Seek, but the next access is at another offset.
+ if (bufs_[curr_].async_read_in_progress_ &&
+ IsOffsetInBufferWithAsyncProgress(offset, curr_)) {
+ PollAndUpdateBuffersIfNeeded(offset);
+ }
+ second = curr_ ^ 1;
+
+ // If the data overlaps the two buffers, copy the data from curr_ and
+ // call ReadAsync on curr_.
+ if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
+ IsOffsetInBuffer(offset, curr_) &&
+ (/*Data extends over the curr_ buffer and the second buffer either has data
+ or is being populated=*/
+ (offset + length > bufs_[second].offset_) &&
+ (bufs_[second].async_read_in_progress_ ||
+ DoesBufferContainData(second)))) {
+ // Allocate a new buffer for the third buffer.
+ bufs_[2].buffer_.Clear();
+ bufs_[2].buffer_.Alignment(alignment);
+ bufs_[2].buffer_.AllocateNewBuffer(length);
+ bufs_[2].offset_ = offset;
+ copy_to_third_buffer = true;
+
+ CopyDataToBuffer(curr_, tmp_offset, tmp_length);
+
+ // Since the data in curr_ has been consumed, issue async prefetching on
+ // curr_, but only if the remaining data lies within the second buffer.
+ size_t second_size = bufs_[second].async_read_in_progress_
+ ? bufs_[second].async_req_len_
+ : bufs_[second].buffer_.CurrentSize();
+ if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size) {
+ uint64_t rounddown_start = bufs_[second].offset_ + second_size;
+ uint64_t roundup_end =
+ Roundup(rounddown_start + readahead_size, alignment);
+ uint64_t roundup_len = roundup_end - rounddown_start;
+ uint64_t chunk_len = 0;
+ CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, curr_,
+ false, chunk_len);
+ assert(chunk_len == 0);
+ assert(roundup_len >= chunk_len);
+
+ bufs_[curr_].offset_ = rounddown_start;
+ uint64_t read_len = static_cast<size_t>(roundup_len - chunk_len);
+ s = ReadAsync(opts, reader, read_len, rounddown_start, curr_);
+ if (!s.ok()) {
+ DestroyAndClearIOHandle(curr_);
+ bufs_[curr_].buffer_.Clear();
+ return s;
+ }
+ }
+ curr_ = curr_ ^ 1;
+ }
+ return s;
+}
+// If async_io is enabled for sequential reads, PrefetchAsyncInternal is
+// called. When buffers are switched, we clear the curr_ buffer as we assume
+// its data has been consumed because of the sequential reads.
+// Data in the buffers is always sequential, with curr_ following second and
+// not vice versa.
+//
+// Scenarios for prefetching asynchronously:
+// Case1: If both buffers are empty, prefetch n + readahead_size_/2 bytes
+// synchronously in curr_ and prefetch readahead_size_/2 async in second
+// buffer.
+// Case2: If second buffer has partial or full data, make it current and
+// prefetch readahead_size_/2 async in second buffer. In case of partial
+// data, prefetch the remaining bytes from size n synchronously to
+// fulfill the request.
+// Case3: If curr_ has partial data, prefetch the remaining bytes from size n
+// synchronously in curr_ to fulfill the request and prefetch
+// readahead_size_/2 bytes async in second buffer.
+// Case4: (Special case) If data is in both buffers, copy requested data from
+// curr_, send async request on curr_, wait for poll to fill second
+// buffer (if any), and copy remaining data from second buffer to third
+// buffer.
+Status FilePrefetchBuffer::PrefetchAsyncInternal(
+ const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
+ size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority,
+ bool& copy_to_third_buffer) {
+ if (!enable_) {
+ return Status::OK();
+ }
+
+ TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start");
+
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
+ Status s;
+ uint64_t tmp_offset = offset;
+ size_t tmp_length = length;
+
+ // 1. Abort IO and swap buffers if needed to point curr_ to first buffer with
+ // data.
+ if (!explicit_prefetch_submitted_) {
+ AbortIOIfNeeded(offset);
+ }
+ UpdateBuffersIfNeeded(offset);
+
+ // 2. Handle data that overlaps the two buffers. If the data overlaps, then
+ // during this call:
+ // - data from curr_ is copied into the third buffer,
+ // - curr_ is sent for async prefetching of further data if the second
+ // buffer contains the remaining requested data or has an async prefetch
+ // in progress,
+ // - buffers are switched so that curr_ now points to the second buffer to
+ // copy the remaining data.
+ s = HandleOverlappingData(opts, reader, offset, length, readahead_size,
+ rate_limiter_priority, copy_to_third_buffer,
+ tmp_offset, tmp_length);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // 3. Call Poll only if data is needed for the second buffer.
+ // - Return if the whole data is in curr_ and the second buffer is either in
+ // progress or already full.
+ // - If the second buffer is empty, ReadAsync will be issued for it below.
+ if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
+ IsDataBlockInBuffer(offset, length, curr_)) {
+ // Whole data is in curr_.
+ UpdateBuffersIfNeeded(offset);
+ if (!IsSecondBuffEligibleForPrefetching()) {
+ return s;
+ }
+ } else {
+ // After the poll request, curr_ may be empty because of an IOError in the
+ // read callback, or it may contain the required data.
+ PollAndUpdateBuffersIfNeeded(offset);
+ }
+
+ if (copy_to_third_buffer) {
+ offset = tmp_offset;
+ length = tmp_length;
+ }
+
+ // 4. After polling and swapping buffers, if all the requested bytes are in
+ // curr_, it will only go for async prefetching.
+ // copy_to_third_buffer is a special case so it will be handled separately.
+ if (!copy_to_third_buffer && DoesBufferContainData(curr_) &&
+ IsDataBlockInBuffer(offset, length, curr_)) {
+ offset += length;
+ length = 0;
+
+ // Since the async request was submitted directly by calling PrefetchAsync
+ // in the previous call, we don't need to prefetch further; this call only
+ // polls the data submitted by that request.
+ if (explicit_prefetch_submitted_) {
+ return s;
+ }
+ if (!IsSecondBuffEligibleForPrefetching()) {
+ return s;
+ }
+ }
+
+ uint32_t second = curr_ ^ 1;
+ assert(!bufs_[curr_].async_read_in_progress_);
+
+ // If curr_ became empty because of an IOError, abort the IO for second as
+ // well. Otherwise the data might not align if more data needs to be read
+ // into curr_, which might overlap with the second buffer.
+ if (!DoesBufferContainData(curr_) && bufs_[second].async_read_in_progress_) {
+ if (bufs_[second].io_handle_ != nullptr) {
+ std::vector<void*> handles;
+ handles.emplace_back(bufs_[second].io_handle_);
+ {
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+ Status status = fs_->AbortIO(handles);
+ assert(status.ok());
+ }
+ }
+ DestroyAndClearIOHandle(second);
+ bufs_[second].buffer_.Clear();
+ }
+
+ // 5. The data overlaps the two buffers, i.e. some of it has been copied to
+ // the third buffer and the remainder will be copied below.
+ if (copy_to_third_buffer && DoesBufferContainData(curr_)) {
+ CopyDataToBuffer(curr_, offset, length);
+
+ // Length == 0: All the requested data has been copied to the third buffer
+ // and async prefetching has already been issued, so we can return without
+ // doing anything further.
+ // Length > 0: More data needs to be consumed, so continue the async and
+ // sync prefetching and copy the remaining data to the third buffer at the
+ // end.
+ if (length == 0) {
+ return s;
+ }
+ }
+
+ // 6. Go for ReadAsync and Read (if needed).
+ size_t prefetch_size = length + readahead_size;
+ size_t _offset = static_cast<size_t>(offset);
+
+ // offset and size alignment for curr_ buffer with synchronous prefetching
+ uint64_t rounddown_start1 = Rounddown(_offset, alignment);
+ uint64_t roundup_end1 = Roundup(_offset + prefetch_size, alignment);
+ uint64_t roundup_len1 = roundup_end1 - rounddown_start1;
+ assert(roundup_len1 >= alignment);
+ assert(roundup_len1 % alignment == 0);
+ uint64_t chunk_len1 = 0;
+ uint64_t read_len1 = 0;
+
+ assert(!bufs_[second].async_read_in_progress_ &&
+ !DoesBufferContainData(second));
+
+ // For length == 0, skip the synchronous prefetching. read_len1 will be 0.
+ if (length > 0) {
+ CalculateOffsetAndLen(alignment, offset, roundup_len1, curr_,
+ false /*refit_tail*/, chunk_len1);
+ assert(roundup_len1 >= chunk_len1);
+ read_len1 = static_cast<size_t>(roundup_len1 - chunk_len1);
+ }
+ {
+ // offset and size alignment for second buffer for asynchronous
+ // prefetching
+ uint64_t rounddown_start2 = roundup_end1;
+ uint64_t roundup_end2 =
+ Roundup(rounddown_start2 + readahead_size, alignment);
+
+ // For length == 0, do the asynchronous prefetching in second instead of
+ // synchronous prefetching in curr_.
+ if (length == 0) {
+ rounddown_start2 =
+ bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
+ roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment);
+ }
+
+ uint64_t roundup_len2 = roundup_end2 - rounddown_start2;
+ uint64_t chunk_len2 = 0;
+ CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second,
+ false /*refit_tail*/, chunk_len2);
+ assert(chunk_len2 == 0);
+ // Update the buffer offset.
+ bufs_[second].offset_ = rounddown_start2;
+ assert(roundup_len2 >= chunk_len2);
+ uint64_t read_len2 = static_cast<size_t>(roundup_len2 - chunk_len2);
+ Status tmp_s = ReadAsync(opts, reader, read_len2, rounddown_start2, second);
+ if (!tmp_s.ok()) {
+ DestroyAndClearIOHandle(second);
+ bufs_[second].buffer_.Clear();
+ }
+ }
+
+ if (read_len1 > 0) {
+ s = Read(opts, reader, rate_limiter_priority, read_len1, chunk_len1,
+ rounddown_start1, curr_);
+ if (!s.ok()) {
+ if (bufs_[second].io_handle_ != nullptr) {
+ std::vector<void*> handles;
+ handles.emplace_back(bufs_[second].io_handle_);
+ {
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+ Status status = fs_->AbortIO(handles);
+ assert(status.ok());
+ }
+ }
+ DestroyAndClearIOHandle(second);
+ bufs_[second].buffer_.Clear();
+ bufs_[curr_].buffer_.Clear();
+ return s;
+ }
+ }
+ // Copy remaining requested bytes to third_buffer.
+ if (copy_to_third_buffer && length > 0) {
+ CopyDataToBuffer(curr_, offset, length);
+ }
+ return s;
+}
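// [Illustrative sketch, not part of the diff] The alignment arithmetic in step
// 6 above, modeled with free functions and made-up values. Rounddown/Roundup
// here are local stand-ins for the utility helpers the real code uses.
#include <cstdint>
#include <cstdio>

static uint64_t Rounddown(uint64_t x, uint64_t align) { return x - (x % align); }
static uint64_t Roundup(uint64_t x, uint64_t align) {
  return ((x + align - 1) / align) * align;
}

int main() {
  const uint64_t alignment = 4096;   // e.g. a direct-IO block size
  const uint64_t offset = 10000;     // requested offset
  const uint64_t length = 3000;      // requested bytes
  const uint64_t readahead = 16384;  // readahead_size passed in

  // Synchronous read into curr_: request + readahead, rounded to alignment.
  uint64_t start1 = Rounddown(offset, alignment);                   // 8192
  uint64_t end1 = Roundup(offset + length + readahead, alignment);  // 32768
  // Asynchronous read into second: the next readahead window.
  uint64_t start2 = end1;                                  // 32768
  uint64_t end2 = Roundup(start2 + readahead, alignment);  // 49152

  std::printf("sync  [%llu, %llu)\nasync [%llu, %llu)\n",
              static_cast<unsigned long long>(start1),
              static_cast<unsigned long long>(end1),
              static_cast<unsigned long long>(start2),
              static_cast<unsigned long long>(end2));
  return 0;
}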
+
+bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
+ RandomAccessFileReader* reader,
+ uint64_t offset, size_t n,
+ Slice* result, Status* status,
+ Env::IOPriority rate_limiter_priority,
+ bool for_compaction /* = false */) {
+ if (track_min_offset_ && offset < min_offset_read_) {
+ min_offset_read_ = static_cast<size_t>(offset);
+ }
+ if (!enable_ || (offset < bufs_[curr_].offset_)) {
+ return false;
+ }
+
+ // If the buffer contains only a few of the requested bytes:
+ // If readahead is enabled: prefetch the remaining bytes + readahead bytes
+ // and satisfy the request.
+ // If readahead is not enabled: return false.
+ TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
+ &readahead_size_);
+ if (offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
+ if (readahead_size_ > 0) {
+ Status s;
+ assert(reader != nullptr);
+ assert(max_readahead_size_ >= readahead_size_);
+ if (for_compaction) {
+ s = Prefetch(opts, reader, offset, std::max(n, readahead_size_),
+ rate_limiter_priority);
+ } else {
+ if (implicit_auto_readahead_) {
+ if (!IsEligibleForPrefetch(offset, n)) {
+ // Ignore status as Prefetch is not called.
+ s.PermitUncheckedError();
+ return false;
+ }
+ }
+ s = Prefetch(opts, reader, offset, n + readahead_size_,
+ rate_limiter_priority);
+ }
+ if (!s.ok()) {
+ if (status) {
+ *status = s;
+ }
+#ifndef NDEBUG
+ IGNORE_STATUS_IF_ERROR(s);
+#endif
+ return false;
+ }
+ readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+ } else {
+ return false;
+ }
+ }
+ UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+
+ uint64_t offset_in_buffer = offset - bufs_[curr_].offset_;
+ *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n);
+ return true;
+}
+
+bool FilePrefetchBuffer::TryReadFromCacheAsync(
+ const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
+ size_t n, Slice* result, Status* status,
+ Env::IOPriority rate_limiter_priority) {
+ if (track_min_offset_ && offset < min_offset_read_) {
+ min_offset_read_ = static_cast<size_t>(offset);
+ }
+
+ if (!enable_) {
+ return false;
+ }
+
+ if (explicit_prefetch_submitted_) {
+ // explicit_prefetch_submitted_ is a special case: the request submitted in
+ // PrefetchAsync is expected to match this request, otherwise the buffers
+ // will be outdated.
+ // A random offset was requested, so abort the IOs.
+ if (prev_offset_ != offset) {
+ AbortAllIOs();
+ bufs_[curr_].buffer_.Clear();
+ bufs_[curr_ ^ 1].buffer_.Clear();
+ explicit_prefetch_submitted_ = false;
+ return false;
+ }
+ }
+
+ if (!explicit_prefetch_submitted_ && offset < bufs_[curr_].offset_) {
+ return false;
+ }
+
+ bool prefetched = false;
+ bool copy_to_third_buffer = false;
+ // If the buffer contains only a few of the requested bytes:
+ // If readahead is enabled: prefetch the remaining bytes + readahead bytes
+ // and satisfy the request.
+ // If readahead is not enabled: return false.
+ TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
+ &readahead_size_);
+
+ if (explicit_prefetch_submitted_ ||
+ (bufs_[curr_].async_read_in_progress_ ||
+ offset + n >
+ bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize())) {
+ if (readahead_size_ > 0) {
+ Status s;
+ assert(reader != nullptr);
+ assert(max_readahead_size_ >= readahead_size_);
+
+ if (implicit_auto_readahead_) {
+ if (!IsEligibleForPrefetch(offset, n)) {
+ // Ignore status as Prefetch is not called.
+ s.PermitUncheckedError();
+ return false;
+ }
+ }
+ // Prefetch n + readahead_size_/2 synchronously as remaining
+ // readahead_size_/2 will be prefetched asynchronously.
+ s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2,
+ rate_limiter_priority, copy_to_third_buffer);
+ explicit_prefetch_submitted_ = false;
+ if (!s.ok()) {
+ if (status) {
+ *status = s;
+ }
+#ifndef NDEBUG
+ IGNORE_STATUS_IF_ERROR(s);
+#endif
+ return false;
+ }
+ prefetched = explicit_prefetch_submitted_ ? false : true;
+ } else {
+ return false;
+ }
+ }
+
+ UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+
+ uint32_t index = curr_;
+ if (copy_to_third_buffer) {
+ index = 2;
+ }
+ uint64_t offset_in_buffer = offset - bufs_[index].offset_;
+ *result = Slice(bufs_[index].buffer_.BufferStart() + offset_in_buffer, n);
+ if (prefetched) {
+ readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+ }
+ return true;
+}
+
+void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req,
+ void* cb_arg) {
+ uint32_t index = *(static_cast<uint32_t*>(cb_arg));
+#ifndef NDEBUG
+ if (req.result.size() < req.len) {
+ // Fake an IO error to force db_stress fault injection to ignore
+ // truncated read errors
+ IGNORE_STATUS_IF_ERROR(Status::IOError());
+ }
+ IGNORE_STATUS_IF_ERROR(req.status);
+#endif
+
+ if (req.status.ok()) {
+ if (req.offset + req.result.size() <=
+ bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()) {
+ // All requested bytes are already in the buffer or no data is read
+ // because of EOF. So no need to update.
+ return;
+ }
+ if (req.offset < bufs_[index].offset_) {
+ // Next block to be read has changed (Recent read was not a sequential
+ // read). So ignore this read.
+ return;
+ }
+ size_t current_size = bufs_[index].buffer_.CurrentSize();
+ bufs_[index].buffer_.Size(current_size + req.result.size());
+ }
+}
+
+Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
+ RandomAccessFileReader* reader,
+ uint64_t offset, size_t n,
+ Slice* result) {
+ assert(reader != nullptr);
+ if (!enable_) {
+ return Status::NotSupported();
+ }
+
+ TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
+
+ num_file_reads_ = 0;
+ explicit_prefetch_submitted_ = false;
+ bool is_eligible_for_prefetching = false;
+ if (readahead_size_ > 0 &&
+ (!implicit_auto_readahead_ ||
+ num_file_reads_ + 1 >= num_file_reads_for_auto_readahead_)) {
+ is_eligible_for_prefetching = true;
+ }
+
+ // 1. Cancel any pending async read to make code simpler as buffers can be out
+ // of sync.
+ AbortAllIOs();
+
+ // 2. Clear outdated data.
+ UpdateBuffersIfNeeded(offset);
+ uint32_t second = curr_ ^ 1;
+ // Since PrefetchAsync can be called on non-sequential reads, the offset can
+ // be less than the curr_ buffer's offset. In that case both buffers are
+ // cleared as well.
+ if (DoesBufferContainData(curr_) && !IsOffsetInBuffer(offset, curr_)) {
+ bufs_[curr_].buffer_.Clear();
+ bufs_[second].buffer_.Clear();
+ }
+
+ UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
+
+ bool data_found = false;
+
+ // 3. If curr_ has full data.
+ if (DoesBufferContainData(curr_) && IsDataBlockInBuffer(offset, n, curr_)) {
+ uint64_t offset_in_buffer = offset - bufs_[curr_].offset_;
+ *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n);
+ data_found = true;
+ // Update num_file_reads_ here because TryReadFromCacheAsync won't be called
+ // to poll and update it when the data is already found.
+ num_file_reads_++;
+
+ // 3.1 If second also has some data or is not eligible for prefetching,
+ // return.
+ if (!is_eligible_for_prefetching || DoesBufferContainData(second)) {
+ return Status::OK();
+ }
+ } else {
+ // Partial data in curr_.
+ bufs_[curr_].buffer_.Clear();
+ }
+ bufs_[second].buffer_.Clear();
+
+ Status s;
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
+ size_t prefetch_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0;
+ size_t offset_to_read = static_cast<size_t>(offset);
+ uint64_t rounddown_start1 = 0;
+ uint64_t roundup_end1 = 0;
+ uint64_t rounddown_start2 = 0;
+ uint64_t roundup_end2 = 0;
+ uint64_t chunk_len1 = 0;
+ uint64_t chunk_len2 = 0;
+ size_t read_len1 = 0;
+ size_t read_len2 = 0;
+
+ // - If curr_ is empty.
+ // - Call async read for full data + prefetch_size on curr_.
+ // - Call async read for prefetch_size on second if eligible.
+ // - If curr_ is filled.
+ // - prefetch_size on second.
+ // Calculate length and offsets for reading.
+ if (!DoesBufferContainData(curr_)) {
+ // Prefetch full data + prefetch_size in curr_.
+ rounddown_start1 = Rounddown(offset_to_read, alignment);
+ roundup_end1 = Roundup(offset_to_read + n + prefetch_size, alignment);
+ uint64_t roundup_len1 = roundup_end1 - rounddown_start1;
+ assert(roundup_len1 >= alignment);
+ assert(roundup_len1 % alignment == 0);
+
+ CalculateOffsetAndLen(alignment, rounddown_start1, roundup_len1, curr_,
+ false, chunk_len1);
+ assert(chunk_len1 == 0);
+ assert(roundup_len1 >= chunk_len1);
+ read_len1 = static_cast<size_t>(roundup_len1 - chunk_len1);
+ bufs_[curr_].offset_ = rounddown_start1;
+ }
+
+ if (is_eligible_for_prefetching) {
+ if (DoesBufferContainData(curr_)) {
+ rounddown_start2 =
+ bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
+ } else {
+ rounddown_start2 = roundup_end1;
+ }
+
+ roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment);
+ uint64_t roundup_len2 = roundup_end2 - rounddown_start2;
+
+ assert(roundup_len2 >= alignment);
+ CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second,
+ false, chunk_len2);
+ assert(chunk_len2 == 0);
+ assert(roundup_len2 >= chunk_len2);
+ read_len2 = static_cast<size_t>(roundup_len2 - chunk_len2);
+ // Update the buffer offset.
+ bufs_[second].offset_ = rounddown_start2;
+ }
+
+ if (read_len1) {
+ s = ReadAsync(opts, reader, read_len1, rounddown_start1, curr_);
+ if (!s.ok()) {
+ DestroyAndClearIOHandle(curr_);
+ bufs_[curr_].buffer_.Clear();
+ return s;
+ }
+ explicit_prefetch_submitted_ = true;
+ prev_len_ = 0;
+ }
+ if (read_len2) {
+ s = ReadAsync(opts, reader, read_len2, rounddown_start2, second);
+ if (!s.ok()) {
+ DestroyAndClearIOHandle(second);
+ bufs_[second].buffer_.Clear();
+ return s;
+ }
+ readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+ }
+ return (data_found ? Status::OK() : Status::TryAgain());
+}
+
+} // namespace ROCKSDB_NAMESPACE
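// [Illustrative sketch, not part of the diff] One way a caller might drive the
// PrefetchAsync / TryReadFromCacheAsync pair defined above. `opts`, `reader`,
// and the offsets are assumed to exist in the caller; a FileSystem and
// SystemClock would normally be passed to the constructor so pending reads can
// be polled or aborted. Error handling is reduced to the minimum.
//
//   FilePrefetchBuffer fpb(/*readahead_size=*/8 << 10,
//                          /*max_readahead_size=*/64 << 10);
//   Slice result;
//   Status s = fpb.PrefetchAsync(opts, reader, /*offset=*/0, /*n=*/4096,
//                                &result);
//   if (s.IsTryAgain()) {
//     // The read was submitted asynchronously; poll it later with the *same*
//     // offset, otherwise the pending IOs are aborted and the buffers cleared.
//     Status io_s;
//     if (fpb.TryReadFromCacheAsync(opts, reader, /*offset=*/0, /*n=*/4096,
//                                   &result, &io_s, Env::IO_TOTAL)) {
//       // result now points into the prefetch buffer.
//     }
//   } else if (s.ok()) {
//     // The data was already buffered; result is valid immediately.
//   }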
diff --git a/src/rocksdb/file/file_prefetch_buffer.h b/src/rocksdb/file/file_prefetch_buffer.h
new file mode 100644
index 000000000..a4a75fe2b
--- /dev/null
+++ b/src/rocksdb/file/file_prefetch_buffer.h
@@ -0,0 +1,446 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <sstream>
+#include <string>
+
+#include "file/readahead_file_info.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "util/aligned_buffer.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#define DEFAULT_DECREMENT 8 * 1024
+
+struct IOOptions;
+class RandomAccessFileReader;
+
+struct BufferInfo {
+ AlignedBuffer buffer_;
+
+ uint64_t offset_ = 0;
+
+ // Below parameters are used in case of async read flow.
+ // Length requested for in ReadAsync.
+ size_t async_req_len_ = 0;
+
+ // async_read_in_progress can be used as mutex. Callback can update the buffer
+ // and its size but async_read_in_progress is only set by main thread.
+ bool async_read_in_progress_ = false;
+
+ // io_handle is allocated and used by underlying file system in case of
+ // asynchronous reads.
+ void* io_handle_ = nullptr;
+
+ IOHandleDeleter del_fn_ = nullptr;
+
+ // pos represents the index of this buffer in vector of BufferInfo.
+ uint32_t pos_ = 0;
+};
+
+// FilePrefetchBuffer is a smart buffer to store and read data from a file.
+class FilePrefetchBuffer {
+ public:
+ // Constructor.
+ //
+ // All arguments are optional.
+ // readahead_size : the initial readahead size.
+ // max_readahead_size : the maximum readahead size.
+ // If max_readahead_size > readahead_size, the readahead size will be
+ // doubled on every IO until max_readahead_size is hit.
+ // Typically this is set as a multiple of readahead_size.
+ // max_readahead_size should be greater than or equal to readahead_size.
+ // enable : controls whether reading from the buffer is enabled.
+ // If false, TryReadFromCache() always returns false, and we only take stats
+ // for the minimum offset if track_min_offset = true.
+ // track_min_offset : Track the minimum offset ever read and collect stats on
+ // it. Used for adaptable readahead of the file footer/metadata.
+ // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after
+ // two sequential scans.
+ //
+ // Automatic readahead is enabled for a file if readahead_size
+ // and max_readahead_size are passed in.
+ // A user can construct a FilePrefetchBuffer without any arguments, but use
+ // `Prefetch` to load data into the buffer.
+ FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0,
+ bool enable = true, bool track_min_offset = false,
+ bool implicit_auto_readahead = false,
+ uint64_t num_file_reads = 0,
+ uint64_t num_file_reads_for_auto_readahead = 0,
+ FileSystem* fs = nullptr, SystemClock* clock = nullptr,
+ Statistics* stats = nullptr)
+ : curr_(0),
+ readahead_size_(readahead_size),
+ initial_auto_readahead_size_(readahead_size),
+ max_readahead_size_(max_readahead_size),
+ min_offset_read_(std::numeric_limits<size_t>::max()),
+ enable_(enable),
+ track_min_offset_(track_min_offset),
+ implicit_auto_readahead_(implicit_auto_readahead),
+ prev_offset_(0),
+ prev_len_(0),
+ num_file_reads_for_auto_readahead_(num_file_reads_for_auto_readahead),
+ num_file_reads_(num_file_reads),
+ explicit_prefetch_submitted_(false),
+ fs_(fs),
+ clock_(clock),
+ stats_(stats) {
+ assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) ||
+ (num_file_reads_ == 0));
+ // If ReadOptions.async_io is enabled, data is asynchronously filled in the
+ // second buffer while curr_ is being consumed. If data overlaps the two
+ // buffers, it is copied to the third buffer to return a contiguous buffer.
+ bufs_.resize(3);
+ for (uint32_t i = 0; i < 2; i++) {
+ bufs_[i].pos_ = i;
+ }
+ }
+
+ ~FilePrefetchBuffer() {
+ // Abort any pending async read request before destroying the class object.
+ if (fs_ != nullptr) {
+ std::vector<void*> handles;
+ for (uint32_t i = 0; i < 2; i++) {
+ if (bufs_[i].async_read_in_progress_ &&
+ bufs_[i].io_handle_ != nullptr) {
+ handles.emplace_back(bufs_[i].io_handle_);
+ }
+ }
+ if (!handles.empty()) {
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+ Status s = fs_->AbortIO(handles);
+ assert(s.ok());
+ }
+ }
+
+ // Prefetch buffer bytes discarded.
+ uint64_t bytes_discarded = 0;
+ // Iterate over the 2 buffers.
+ for (int i = 0; i < 2; i++) {
+ int first = i;
+ int second = i ^ 1;
+
+ if (DoesBufferContainData(first)) {
+ // If last block was read completely from first and some bytes in
+ // first buffer are still unconsumed.
+ if (prev_offset_ >= bufs_[first].offset_ &&
+ prev_offset_ + prev_len_ <
+ bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize()) {
+ bytes_discarded += bufs_[first].buffer_.CurrentSize() -
+ (prev_offset_ + prev_len_ - bufs_[first].offset_);
+ }
+ // If data was in second buffer and some/whole block bytes were read
+ // from second buffer.
+ else if (prev_offset_ < bufs_[first].offset_ &&
+ !DoesBufferContainData(second)) {
+ // If last block read was completely from different buffer, this
+ // buffer is unconsumed.
+ if (prev_offset_ + prev_len_ <= bufs_[first].offset_) {
+ bytes_discarded += bufs_[first].buffer_.CurrentSize();
+ }
+ // If last block read overlaps with this buffer and some data is
+ // still unconsumed and previous buffer (second) is not cleared.
+ else if (prev_offset_ + prev_len_ > bufs_[first].offset_ &&
+ bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize() ==
+ bufs_[second].offset_) {
+ bytes_discarded += bufs_[first].buffer_.CurrentSize() -
+ (/*bytes read from this buffer=*/prev_len_ -
+ (bufs_[first].offset_ - prev_offset_));
+ }
+ }
+ }
+ }
+
+ for (uint32_t i = 0; i < 2; i++) {
+ // Release io_handle.
+ DestroyAndClearIOHandle(i);
+ }
+ RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);
+ }
+
+ // Load data into the buffer from a file.
+ // reader : the file reader.
+ // offset : the file offset to start reading from.
+ // n : the number of bytes to read.
+ // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
+ // bypass.
+ Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader,
+ uint64_t offset, size_t n,
+ Env::IOPriority rate_limiter_priority);
+
+ // Request for reading the data from a file asynchronously.
+ // If data already exists in the buffer, result will be updated.
+ // reader : the file reader.
+ // offset : the file offset to start reading from.
+ // n : the number of bytes to read.
+ // result : if data already exists in the buffer, result will
+ // be updated with the data.
+ //
+ // If the data already exists in the buffer, it will return Status::OK,
+ // otherwise it will submit an asynchronous request and return
+ // Status::TryAgain.
+ Status PrefetchAsync(const IOOptions& opts, RandomAccessFileReader* reader,
+ uint64_t offset, size_t n, Slice* result);
+
+ // Tries returning the data for a file read from this buffer if that data is
+ // in the buffer.
+ // It handles tracking the minimum read offset if track_min_offset = true.
+ // It also does the exponential readahead when readahead_size is set as part
+ // of the constructor.
+ //
+ // opts : the IO options to use.
+ // reader : the file reader.
+ // offset : the file offset.
+ // n : the number of bytes.
+ // result : output buffer to put the data into.
+ // s : output status.
+ // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
+ // bypass.
+ // for_compaction : true if cache read is done for compaction read.
+ bool TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader,
+ uint64_t offset, size_t n, Slice* result, Status* s,
+ Env::IOPriority rate_limiter_priority,
+ bool for_compaction = false);
+
+ bool TryReadFromCacheAsync(const IOOptions& opts,
+ RandomAccessFileReader* reader, uint64_t offset,
+ size_t n, Slice* result, Status* status,
+ Env::IOPriority rate_limiter_priority);
+
+ // The minimum `offset` ever passed to TryReadFromCache(). This will only be
+ // tracked if track_min_offset = true.
+ size_t min_offset_read() const { return min_offset_read_; }
+
+ // Called in case of implicit auto prefetching.
+ void UpdateReadPattern(const uint64_t& offset, const size_t& len,
+ bool decrease_readaheadsize) {
+ if (decrease_readaheadsize) {
+ // This block was eligible for prefetch but was found in the cache, so
+ // check whether the readahead_size can be decreased by 8KB (default)
+ // and do so if eligible.
+ DecreaseReadAheadIfEligible(offset, len);
+ }
+ prev_offset_ = offset;
+ prev_len_ = len;
+ explicit_prefetch_submitted_ = false;
+ }
+
+ void GetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) {
+ readahead_info->readahead_size = readahead_size_;
+ readahead_info->num_file_reads = num_file_reads_;
+ }
+
+ void DecreaseReadAheadIfEligible(uint64_t offset, size_t size,
+ size_t value = DEFAULT_DECREMENT) {
+ // Decrease the readahead_size if
+ // - its enabled internally by RocksDB (implicit_auto_readahead_) and,
+ // - readahead_size is greater than 0 and,
+ // - this block would have called prefetch API if not found in cache for
+ // which conditions are:
+ // - few/no bytes are in buffer and,
+ // - block is sequential with the previous read and,
+ // - num_file_reads_ + 1 (including this read) >
+ // num_file_reads_for_auto_readahead_
+ size_t curr_size = bufs_[curr_].async_read_in_progress_
+ ? bufs_[curr_].async_req_len_
+ : bufs_[curr_].buffer_.CurrentSize();
+ if (implicit_auto_readahead_ && readahead_size_ > 0) {
+ if ((offset + size > bufs_[curr_].offset_ + curr_size) &&
+ IsBlockSequential(offset) &&
+ (num_file_reads_ + 1 > num_file_reads_for_auto_readahead_)) {
+ readahead_size_ =
+ std::max(initial_auto_readahead_size_,
+ (readahead_size_ >= value ? readahead_size_ - value : 0));
+ }
+ }
+ }
+
+ // Callback function passed to underlying FS in case of asynchronous reads.
+ void PrefetchAsyncCallback(const FSReadRequest& req, void* cb_arg);
+
+ private:
+ // Calculates roundoff offset and length to be prefetched based on alignment
+ // and data present in buffer_. It also allocates new buffer or refit tail if
+ // required.
+ void CalculateOffsetAndLen(size_t alignment, uint64_t offset,
+ size_t roundup_len, uint32_t index,
+ bool refit_tail, uint64_t& chunk_len);
+
+ void AbortIOIfNeeded(uint64_t offset);
+
+ void AbortAllIOs();
+
+ void UpdateBuffersIfNeeded(uint64_t offset);
+
+ // It calls the Poll API if there is any pending asynchronous request. It
+ // then checks whether the data is in any buffer, clears the outdated data,
+ // and swaps the buffers if required.
+ void PollAndUpdateBuffersIfNeeded(uint64_t offset);
+
+ Status PrefetchAsyncInternal(const IOOptions& opts,
+ RandomAccessFileReader* reader, uint64_t offset,
+ size_t length, size_t readahead_size,
+ Env::IOPriority rate_limiter_priority,
+ bool& copy_to_third_buffer);
+
+ Status Read(const IOOptions& opts, RandomAccessFileReader* reader,
+ Env::IOPriority rate_limiter_priority, uint64_t read_len,
+ uint64_t chunk_len, uint64_t rounddown_start, uint32_t index);
+
+ Status ReadAsync(const IOOptions& opts, RandomAccessFileReader* reader,
+ uint64_t read_len, uint64_t rounddown_start, uint32_t index);
+
+ // Copy the data from src to third buffer.
+ void CopyDataToBuffer(uint32_t src, uint64_t& offset, size_t& length);
+
+ bool IsBlockSequential(const size_t& offset) {
+ return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset));
+ }
+
+ // Called in case of implicit auto prefetching.
+ void ResetValues() {
+ num_file_reads_ = 1;
+ readahead_size_ = initial_auto_readahead_size_;
+ }
+
+ // Called in case of implicit auto prefetching.
+ bool IsEligibleForPrefetch(uint64_t offset, size_t n) {
+ // Prefetch only if this read is sequential otherwise reset readahead_size_
+ // to initial value.
+ if (!IsBlockSequential(offset)) {
+ UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+ ResetValues();
+ return false;
+ }
+ num_file_reads_++;
+
+ // Since the async request was submitted directly by PrefetchAsync in the
+ // previous call, skip the num_file_reads_ check; this call only polls the
+ // data submitted by that request.
+ if (explicit_prefetch_submitted_) {
+ return true;
+ }
+ if (num_file_reads_ <= num_file_reads_for_auto_readahead_) {
+ UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+ return false;
+ }
+ return true;
+ }
+
+ // Helper functions.
+ bool IsDataBlockInBuffer(uint64_t offset, size_t length, uint32_t index) {
+ return (offset >= bufs_[index].offset_ &&
+ offset + length <=
+ bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
+ }
+ bool IsOffsetInBuffer(uint64_t offset, uint32_t index) {
+ return (offset >= bufs_[index].offset_ &&
+ offset < bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
+ }
+ bool DoesBufferContainData(uint32_t index) {
+ return bufs_[index].buffer_.CurrentSize() > 0;
+ }
+ bool IsBufferOutdated(uint64_t offset, uint32_t index) {
+ return (
+ !bufs_[index].async_read_in_progress_ && DoesBufferContainData(index) &&
+ offset >= bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
+ }
+ bool IsBufferOutdatedWithAsyncProgress(uint64_t offset, uint32_t index) {
+ return (bufs_[index].async_read_in_progress_ &&
+ bufs_[index].io_handle_ != nullptr &&
+ offset >= bufs_[index].offset_ + bufs_[index].async_req_len_);
+ }
+ bool IsOffsetInBufferWithAsyncProgress(uint64_t offset, uint32_t index) {
+ return (bufs_[index].async_read_in_progress_ &&
+ offset >= bufs_[index].offset_ &&
+ offset < bufs_[index].offset_ + bufs_[index].async_req_len_);
+ }
+
+ bool IsSecondBuffEligibleForPrefetching() {
+ uint32_t second = curr_ ^ 1;
+ if (bufs_[second].async_read_in_progress_) {
+ return false;
+ }
+ assert(!bufs_[curr_].async_read_in_progress_);
+
+ if (DoesBufferContainData(curr_) && DoesBufferContainData(second) &&
+ (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() ==
+ bufs_[second].offset_)) {
+ return false;
+ }
+ bufs_[second].buffer_.Clear();
+ return true;
+ }
+
+ void DestroyAndClearIOHandle(uint32_t index) {
+ if (bufs_[index].io_handle_ != nullptr && bufs_[index].del_fn_ != nullptr) {
+ bufs_[index].del_fn_(bufs_[index].io_handle_);
+ bufs_[index].io_handle_ = nullptr;
+ bufs_[index].del_fn_ = nullptr;
+ }
+ bufs_[index].async_read_in_progress_ = false;
+ }
+
+ Status HandleOverlappingData(const IOOptions& opts,
+ RandomAccessFileReader* reader, uint64_t offset,
+ size_t length, size_t readahead_size,
+ Env::IOPriority rate_limiter_priority,
+ bool& copy_to_third_buffer, uint64_t& tmp_offset,
+ size_t& tmp_length);
+
+ std::vector<BufferInfo> bufs_;
+ // curr_ represents the index for bufs_ indicating which buffer is being
+ // consumed currently.
+ uint32_t curr_;
+
+ size_t readahead_size_;
+ size_t initial_auto_readahead_size_;
+ // FilePrefetchBuffer object won't be created from Iterator flow if
+ // max_readahead_size_ = 0.
+ size_t max_readahead_size_;
+
+ // The minimum `offset` ever passed to TryReadFromCache().
+ size_t min_offset_read_;
+ // If false, TryReadFromCache() always returns false, and we only take stats
+ // for track_min_offset_ if track_min_offset_ = true.
+ bool enable_;
+ // If true, track minimum `offset` ever passed to TryReadFromCache(), which
+ // can be fetched from min_offset_read().
+ bool track_min_offset_;
+
+ // implicit_auto_readahead is enabled by rocksdb internally after 2
+ // sequential IOs.
+ bool implicit_auto_readahead_;
+ uint64_t prev_offset_;
+ size_t prev_len_;
+ // num_file_reads_ and num_file_reads_for_auto_readahead_ are only used when
+ // implicit_auto_readahead_ is set.
+ uint64_t num_file_reads_for_auto_readahead_;
+ uint64_t num_file_reads_;
+
+ // If explicit_prefetch_submitted_ is set, it indicates that RocksDB called
+ // PrefetchAsync to submit a request. TryReadFromCacheAsync then needs to be
+ // called to poll the submitted request without checking whether the data is
+ // sequential or checking num_file_reads_.
+ bool explicit_prefetch_submitted_;
+
+ FileSystem* fs_;
+ SystemClock* clock_;
+ Statistics* stats_;
+};
+} // namespace ROCKSDB_NAMESPACE
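// [Illustrative sketch, not part of the diff] A self-contained model of how
// readahead_size_ evolves per the comments above: it doubles on each prefetch
// until max_readahead_size_ is hit, and is trimmed by DEFAULT_DECREMENT (8KB)
// when an eligible block is instead found in the block cache. All values here
// are made up.
#include <algorithm>
#include <cstdio>

int main() {
  const size_t initial = 8 * 1024;
  const size_t max_readahead = 256 * 1024;
  const size_t decrement = 8 * 1024;  // mirrors DEFAULT_DECREMENT
  size_t readahead = initial;

  for (int io = 0; io < 6; io++) {  // six prefetching reads
    readahead = std::min(max_readahead, readahead * 2);
    std::printf("after prefetch %d: readahead=%zu\n", io + 1, readahead);
  }
  // A block that would have been prefetched was found in the cache: decrease,
  // but never below the initial auto readahead size.
  readahead = std::max(initial, readahead >= decrement ? readahead - decrement
                                                       : size_t{0});
  std::printf("after cache hit: readahead=%zu\n", readahead);
  return 0;
}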
diff --git a/src/rocksdb/file/file_util.cc b/src/rocksdb/file/file_util.cc
new file mode 100644
index 000000000..7997d6e11
--- /dev/null
+++ b/src/rocksdb/file/file_util.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "file/file_util.h"
+
+#include <algorithm>
+#include <string>
+
+#include "file/random_access_file_reader.h"
+#include "file/sequence_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Utility function to copy a file up to a specified length
+IOStatus CopyFile(FileSystem* fs, const std::string& source,
+ std::unique_ptr<WritableFileWriter>& dest_writer,
+ uint64_t size, bool use_fsync,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const Temperature temperature) {
+ FileOptions soptions;
+ IOStatus io_s;
+ std::unique_ptr<SequentialFileReader> src_reader;
+
+ {
+ soptions.temperature = temperature;
+ std::unique_ptr<FSSequentialFile> srcfile;
+ io_s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ if (size == 0) {
+ // default argument means copy everything
+ io_s = fs->GetFileSize(source, IOOptions(), &size, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ src_reader.reset(
+ new SequentialFileReader(std::move(srcfile), source, io_tracer));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size));
+ // TODO: rate limit copy file
+ io_s = status_to_io_status(
+ src_reader->Read(bytes_to_read, &slice, buffer,
+ Env::IO_TOTAL /* rate_limiter_priority */));
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ if (slice.size() == 0) {
+ return IOStatus::Corruption("file too small");
+ }
+ io_s = dest_writer->Append(slice);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ size -= slice.size();
+ }
+ return dest_writer->Sync(use_fsync);
+}
+
+IOStatus CopyFile(FileSystem* fs, const std::string& source,
+ const std::string& destination, uint64_t size, bool use_fsync,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const Temperature temperature) {
+ FileOptions options;
+ IOStatus io_s;
+ std::unique_ptr<WritableFileWriter> dest_writer;
+
+ {
+ options.temperature = temperature;
+ std::unique_ptr<FSWritableFile> destfile;
+ io_s = fs->NewWritableFile(destination, options, &destfile, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ dest_writer.reset(
+ new WritableFileWriter(std::move(destfile), destination, options));
+ }
+
+ return CopyFile(fs, source, dest_writer, size, use_fsync, io_tracer,
+ temperature);
+}
+
+// Utility function to create a file with the provided contents
+IOStatus CreateFile(FileSystem* fs, const std::string& destination,
+ const std::string& contents, bool use_fsync) {
+ const EnvOptions soptions;
+ IOStatus io_s;
+ std::unique_ptr<WritableFileWriter> dest_writer;
+
+ std::unique_ptr<FSWritableFile> destfile;
+ io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ dest_writer.reset(
+ new WritableFileWriter(std::move(destfile), destination, soptions));
+ io_s = dest_writer->Append(Slice(contents));
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ return dest_writer->Sync(use_fsync);
+}
+
+Status DeleteDBFile(const ImmutableDBOptions* db_options,
+ const std::string& fname, const std::string& dir_to_sync,
+ const bool force_bg, const bool force_fg) {
+#ifndef ROCKSDB_LITE
+ SstFileManagerImpl* sfm =
+ static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
+ if (sfm && !force_fg) {
+ return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
+ } else {
+ return db_options->env->DeleteFile(fname);
+ }
+#else
+ (void)dir_to_sync;
+ (void)force_bg;
+ (void)force_fg;
+ // SstFileManager is not supported in ROCKSDB_LITE
+ // Delete file immediately
+ return db_options->env->DeleteFile(fname);
+#endif
+}
+
+// requested_checksum_func_name specifies the name of the checksum generator
+// expected from checksum_factory. An empty string is permitted, in which case
+// the name of the generator created by the factory is unchecked. When
+// `requested_checksum_func_name` is non-empty, however, the created
+// generator's name must match it, otherwise an `InvalidArgument` error is
+// returned.
+IOStatus GenerateOneFileChecksum(
+ FileSystem* fs, const std::string& file_path,
+ FileChecksumGenFactory* checksum_factory,
+ const std::string& requested_checksum_func_name, std::string* file_checksum,
+ std::string* file_checksum_func_name,
+ size_t verify_checksums_readahead_size, bool allow_mmap_reads,
+ std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
+ Env::IOPriority rate_limiter_priority) {
+ if (checksum_factory == nullptr) {
+ return IOStatus::InvalidArgument("Checksum factory is invalid");
+ }
+ assert(file_checksum != nullptr);
+ assert(file_checksum_func_name != nullptr);
+
+ FileChecksumGenContext gen_context;
+ gen_context.requested_checksum_func_name = requested_checksum_func_name;
+ gen_context.file_name = file_path;
+ std::unique_ptr<FileChecksumGenerator> checksum_generator =
+ checksum_factory->CreateFileChecksumGenerator(gen_context);
+ if (checksum_generator == nullptr) {
+ std::string msg =
+ "Cannot get the file checksum generator based on the requested "
+ "checksum function name: " +
+ requested_checksum_func_name +
+ " from checksum factory: " + checksum_factory->Name();
+ return IOStatus::InvalidArgument(msg);
+ } else {
+ // For backward compatibility, and for file ingestion clients where there is
+ // no stored checksum function name, `requested_checksum_func_name` can be
+ // empty. If a checksum function name is requested, we expect it to match
+ // the name of the created checksum generator.
+ if (!requested_checksum_func_name.empty() &&
+ checksum_generator->Name() != requested_checksum_func_name) {
+ std::string msg = "Expected file checksum generator named '" +
+ requested_checksum_func_name +
+ "', while the factory created one "
+ "named '" +
+ checksum_generator->Name() + "'";
+ return IOStatus::InvalidArgument(msg);
+ }
+ }
+
+ uint64_t size;
+ IOStatus io_s;
+ std::unique_ptr<RandomAccessFileReader> reader;
+ {
+ std::unique_ptr<FSRandomAccessFile> r_file;
+ io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ io_s = fs->GetFileSize(file_path, IOOptions(), &size, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ reader.reset(new RandomAccessFileReader(std::move(r_file), file_path,
+ nullptr /*Env*/, io_tracer, nullptr,
+ 0, nullptr, rate_limiter));
+ }
+
+ // Found that 256 KB readahead size provides the best performance, based on
+ // experiments, for auto readahead. Experiment data is in PR #3282.
+ size_t default_max_read_ahead_size = 256 * 1024;
+ size_t readahead_size = (verify_checksums_readahead_size != 0)
+ ? verify_checksums_readahead_size
+ : default_max_read_ahead_size;
+
+ FilePrefetchBuffer prefetch_buffer(readahead_size /* readahead_size */,
+ readahead_size /* max_readahead_size */,
+ !allow_mmap_reads /* enable */);
+
+ Slice slice;
+ uint64_t offset = 0;
+ IOOptions opts;
+ while (size > 0) {
+ size_t bytes_to_read =
+ static_cast<size_t>(std::min(uint64_t{readahead_size}, size));
+ if (!prefetch_buffer.TryReadFromCache(
+ opts, reader.get(), offset, bytes_to_read, &slice,
+ nullptr /* status */, rate_limiter_priority,
+ false /* for_compaction */)) {
+ return IOStatus::Corruption("file read failed");
+ }
+ if (slice.size() == 0) {
+ return IOStatus::Corruption("file too small");
+ }
+ checksum_generator->Update(slice.data(), slice.size());
+ size -= slice.size();
+ offset += slice.size();
+ }
+ checksum_generator->Finalize();
+ *file_checksum = checksum_generator->GetChecksum();
+ *file_checksum_func_name = checksum_generator->Name();
+ return IOStatus::OK();
+}
+
+Status DestroyDir(Env* env, const std::string& dir) {
+ Status s;
+ if (env->FileExists(dir).IsNotFound()) {
+ return s;
+ }
+ std::vector<std::string> files_in_dir;
+ s = env->GetChildren(dir, &files_in_dir);
+ if (s.ok()) {
+ for (auto& file_in_dir : files_in_dir) {
+ std::string path = dir + "/" + file_in_dir;
+ bool is_dir = false;
+ s = env->IsDirectory(path, &is_dir);
+ if (s.ok()) {
+ if (is_dir) {
+ s = DestroyDir(env, path);
+ } else {
+ s = env->DeleteFile(path);
+ }
+ } else if (s.IsNotSupported()) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ // IsDirectory, etc. might not report NotFound
+ if (s.IsNotFound() || env->FileExists(path).IsNotFound()) {
+ // Allow files to be deleted externally
+ s = Status::OK();
+ } else {
+ break;
+ }
+ }
+ }
+ }
+
+ if (s.ok()) {
+ s = env->DeleteDir(dir);
+ // DeleteDir might or might not report NotFound
+ if (!s.ok() && (s.IsNotFound() || env->FileExists(dir).IsNotFound())) {
+ // Allow to be deleted externally
+ s = Status::OK();
+ }
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
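// [Illustrative sketch, not part of the diff] A minimal way the helpers above
// might be exercised. The paths are made up, the default FileSystem is only an
// assumption for the example, and errors are checked but not handled.
//
//   std::shared_ptr<FileSystem> fs = FileSystem::Default();
//   IOStatus io_s = CreateFile(fs.get(), "/tmp/example_src", "hello rocksdb",
//                              /*use_fsync=*/false);
//   if (io_s.ok()) {
//     // size == 0 means "copy everything"; CopyFile looks up the file size.
//     io_s = CopyFile(fs.get(), "/tmp/example_src", "/tmp/example_dst",
//                     /*size=*/0, /*use_fsync=*/false, /*io_tracer=*/nullptr,
//                     Temperature::kUnknown);
//   }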
diff --git a/src/rocksdb/file/file_util.h b/src/rocksdb/file/file_util.h
new file mode 100644
index 000000000..d46a7ba0e
--- /dev/null
+++ b/src/rocksdb/file/file_util.h
@@ -0,0 +1,89 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <string>
+
+#include "file/filename.h"
+#include "options/db_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/types.h"
+#include "trace_replay/io_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+// use_fsync maps to options.use_fsync, which determines the way that
+// the file is synced after copying.
+extern IOStatus CopyFile(FileSystem* fs, const std::string& source,
+ std::unique_ptr<WritableFileWriter>& dest_writer,
+ uint64_t size, bool use_fsync,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const Temperature temperature);
+extern IOStatus CopyFile(FileSystem* fs, const std::string& source,
+ const std::string& destination, uint64_t size,
+ bool use_fsync,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const Temperature temperature);
+inline IOStatus CopyFile(const std::shared_ptr<FileSystem>& fs,
+ const std::string& source,
+ const std::string& destination, uint64_t size,
+ bool use_fsync,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const Temperature temperature) {
+ return CopyFile(fs.get(), source, destination, size, use_fsync, io_tracer,
+ temperature);
+}
+extern IOStatus CreateFile(FileSystem* fs, const std::string& destination,
+ const std::string& contents, bool use_fsync);
+
+inline IOStatus CreateFile(const std::shared_ptr<FileSystem>& fs,
+ const std::string& destination,
+ const std::string& contents, bool use_fsync) {
+ return CreateFile(fs.get(), destination, contents, use_fsync);
+}
+
+extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
+ const std::string& fname,
+ const std::string& path_to_sync, const bool force_bg,
+ const bool force_fg);
+
+extern IOStatus GenerateOneFileChecksum(
+ FileSystem* fs, const std::string& file_path,
+ FileChecksumGenFactory* checksum_factory,
+ const std::string& requested_checksum_func_name, std::string* file_checksum,
+ std::string* file_checksum_func_name,
+ size_t verify_checksums_readahead_size, bool allow_mmap_reads,
+ std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
+ Env::IOPriority rate_limiter_priority);
+
+inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
+ SystemClock* clock, IOOptions& opts) {
+ if (ro.deadline.count()) {
+ std::chrono::microseconds now =
+ std::chrono::microseconds(clock->NowMicros());
+ // Ensure there is at least 1us available. We don't want to pass a value of
+ // 0 as that means no timeout.
+ if (now >= ro.deadline) {
+ return IOStatus::TimedOut("Deadline exceeded");
+ }
+ opts.timeout = ro.deadline - now;
+ }
+
+ if (ro.io_timeout.count() &&
+ (!opts.timeout.count() || ro.io_timeout < opts.timeout)) {
+ opts.timeout = ro.io_timeout;
+ }
+
+ opts.rate_limiter_priority = ro.rate_limiter_priority;
+ return IOStatus::OK();
+}
+
+// Test method to delete the input directory and all of its contents.
+// This method is destructive and is meant for use only in tests!!!
+Status DestroyDir(Env* env, const std::string& dir);
+} // namespace ROCKSDB_NAMESPACE
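// [Illustrative sketch, not part of the diff] The timeout selection performed
// by PrepareIOFromReadOptions above, reduced to plain chrono arithmetic with
// made-up numbers: the effective timeout is the time remaining until the
// deadline, further capped by io_timeout when that is smaller.
#include <chrono>
#include <cstdio>

int main() {
  using std::chrono::microseconds;
  microseconds now(1'000'000);       // pretend clock->NowMicros() == 1s
  microseconds deadline(1'250'000);  // ReadOptions::deadline
  microseconds io_timeout(100'000);  // ReadOptions::io_timeout

  microseconds timeout(0);
  if (deadline.count()) {
    if (now >= deadline) {
      std::printf("TimedOut before issuing IO\n");
      return 0;
    }
    timeout = deadline - now;  // 250ms left until the deadline
  }
  if (io_timeout.count() && (!timeout.count() || io_timeout < timeout)) {
    timeout = io_timeout;  // capped to 100ms
  }
  std::printf("effective timeout = %lld us\n",
              static_cast<long long>(timeout.count()));
  return 0;
}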
diff --git a/src/rocksdb/file/filename.cc b/src/rocksdb/file/filename.cc
new file mode 100644
index 000000000..1e04c7339
--- /dev/null
+++ b/src/rocksdb/file/filename.cc
@@ -0,0 +1,523 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "file/filename.h"
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include <cinttypes>
+#include <vector>
+
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kCurrentFileName = "CURRENT";
+const std::string kOptionsFileNamePrefix = "OPTIONS-";
+const std::string kTempFileNameSuffix = "dbtmp";
+
+static const std::string kRocksDbTFileExt = "sst";
+static const std::string kLevelDbTFileExt = "ldb";
+static const std::string kRocksDBBlobFileExt = "blob";
+static const std::string kArchivalDirName = "archive";
+
+// Given a path, flatten the path name by replacing all chars not in
+// {[0-9,a-z,A-Z,-,_,.]} with _, and append '_LOG\0' at the end.
+// Return the number of chars stored in dest, not including the trailing '\0'.
+static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) {
+ const char suffix[] = "_LOG";
+
+ size_t write_idx = 0;
+ size_t i = 0;
+ size_t src_len = path.size();
+
+ while (i < src_len && write_idx < len - sizeof(suffix)) {
+ if ((path[i] >= 'a' && path[i] <= 'z') ||
+ (path[i] >= '0' && path[i] <= '9') ||
+ (path[i] >= 'A' && path[i] <= 'Z') || path[i] == '-' ||
+ path[i] == '.' || path[i] == '_') {
+ dest[write_idx++] = path[i];
+ } else {
+ if (i > 0) {
+ dest[write_idx++] = '_';
+ }
+ }
+ i++;
+ }
+ assert(sizeof(suffix) <= len - write_idx);
+ // "\0" is automatically added by snprintf
+ snprintf(dest + write_idx, len - write_idx, suffix);
+ write_idx += sizeof(suffix) - 1;
+ return write_idx;
+}
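// [Illustrative worked example, not part of the diff] For a hypothetical path
// "/some/db.path", the leading '/' is dropped (i == 0 writes nothing), the
// second '/' becomes '_', and "_LOG" is appended, so dest ends up holding
// "some_db.path_LOG" and the returned length is 16.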
+
+static std::string MakeFileName(uint64_t number, const char* suffix) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%06llu.%s",
+ static_cast<unsigned long long>(number), suffix);
+ return buf;
+}
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+ const char* suffix) {
+ return name + "/" + MakeFileName(number, suffix);
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(name, number, "log");
+}
+
+std::string LogFileName(uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(number, "log");
+}
+
+std::string BlobFileName(uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(number, kRocksDBBlobFileExt.c_str());
+}
+
+std::string BlobFileName(const std::string& blobdirname, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str());
+}
+
+std::string BlobFileName(const std::string& dbname, const std::string& blob_dir,
+ uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(dbname + "/" + blob_dir, number,
+ kRocksDBBlobFileExt.c_str());
+}
+
+std::string ArchivalDirectory(const std::string& dir) {
+ return dir + "/" + kArchivalDirName;
+}
+std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(name + "/" + kArchivalDirName, number, "log");
+}
+
+std::string MakeTableFileName(const std::string& path, uint64_t number) {
+ return MakeFileName(path, number, kRocksDbTFileExt.c_str());
+}
+
+std::string MakeTableFileName(uint64_t number) {
+ return MakeFileName(number, kRocksDbTFileExt.c_str());
+}
+
+std::string Rocks2LevelTableFileName(const std::string& fullname) {
+ assert(fullname.size() > kRocksDbTFileExt.size() + 1);
+ if (fullname.size() <= kRocksDbTFileExt.size() + 1) {
+ return "";
+ }
+ return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) +
+ kLevelDbTFileExt;
+}
+
+uint64_t TableFileNameToNumber(const std::string& name) {
+ uint64_t number = 0;
+ uint64_t base = 1;
+ int pos = static_cast<int>(name.find_last_of('.'));
+ while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') {
+ number += (name[pos] - '0') * base;
+ base *= 10;
+ }
+ return number;
+}
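// [Illustrative worked example, not part of the diff] For a name such as
// "000123.sst", the loop above walks left from the '.' and accumulates
// 3*1 + 2*10 + 1*100 + 0*1000 + 0*10000 + 0*100000 = 123.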
+
+std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
+ uint32_t path_id) {
+ assert(number > 0);
+ std::string path;
+ if (path_id >= db_paths.size()) {
+ path = db_paths.back().path;
+ } else {
+ path = db_paths[path_id].path;
+ }
+ return MakeTableFileName(path, number);
+}
+
+void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+ size_t out_buf_size) {
+ if (path_id == 0) {
+ snprintf(out_buf, out_buf_size, "%" PRIu64, number);
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "%" PRIu64
+ "(path "
+ "%" PRIu32 ")",
+ number, path_id);
+ }
+}
+
+std::string DescriptorFileName(uint64_t number) {
+ assert(number > 0);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "MANIFEST-%06llu",
+ static_cast<unsigned long long>(number));
+ return buf;
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+ return dbname + "/" + DescriptorFileName(number);
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+ return dbname + "/" + kCurrentFileName;
+}
+
+std::string LockFileName(const std::string& dbname) { return dbname + "/LOCK"; }
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+ return MakeFileName(dbname, number, kTempFileNameSuffix.c_str());
+}
+
+InfoLogPrefix::InfoLogPrefix(bool has_log_dir,
+ const std::string& db_absolute_path) {
+ if (!has_log_dir) {
+ const char kInfoLogPrefix[] = "LOG";
+ // "\0" is automatically added to the end
+ snprintf(buf, sizeof(buf), kInfoLogPrefix);
+ prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1);
+ } else {
+ size_t len =
+ GetInfoLogPrefix(NormalizePath(db_absolute_path), buf, sizeof(buf));
+ prefix = Slice(buf, len);
+ }
+}
+
+std::string InfoLogFileName(const std::string& dbname,
+ const std::string& db_path,
+ const std::string& log_dir) {
+ if (log_dir.empty()) {
+ return dbname + "/LOG";
+ }
+
+ InfoLogPrefix info_log_prefix(true, db_path);
+ return log_dir + "/" + info_log_prefix.buf;
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+ const std::string& db_path,
+ const std::string& log_dir) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
+
+ if (log_dir.empty()) {
+ return dbname + "/LOG.old." + buf;
+ }
+
+ InfoLogPrefix info_log_prefix(true, db_path);
+ return log_dir + "/" + info_log_prefix.buf + ".old." + buf;
+}
+
+std::string OptionsFileName(uint64_t file_num) {
+ char buffer[256];
+ snprintf(buffer, sizeof(buffer), "%s%06" PRIu64,
+ kOptionsFileNamePrefix.c_str(), file_num);
+ return buffer;
+}
+std::string OptionsFileName(const std::string& dbname, uint64_t file_num) {
+ return dbname + "/" + OptionsFileName(file_num);
+}
+
+std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) {
+ char buffer[256];
+ snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s",
+ kOptionsFileNamePrefix.c_str(), file_num,
+ kTempFileNameSuffix.c_str());
+ return dbname + "/" + buffer;
+}
+
+std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "/METADB-%llu",
+ static_cast<unsigned long long>(number));
+ return dbname + buf;
+}
+
+std::string IdentityFileName(const std::string& dbname) {
+ return dbname + "/IDENTITY";
+}
+
+// Owned filenames have the form:
+// dbname/IDENTITY
+// dbname/CURRENT
+// dbname/LOCK
+// dbname/<info_log_name_prefix>
+// dbname/<info_log_name_prefix>.old.[0-9]+
+// dbname/MANIFEST-[0-9]+
+// dbname/[0-9]+.(log|sst|blob)
+// dbname/METADB-[0-9]+
+// dbname/OPTIONS-[0-9]+
+// dbname/OPTIONS-[0-9]+.dbtmp
+// Disregards / at the beginning
+bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type,
+ WalFileType* log_type) {
+ return ParseFileName(fname, number, "", type, log_type);
+}
+
+bool ParseFileName(const std::string& fname, uint64_t* number,
+ const Slice& info_log_name_prefix, FileType* type,
+ WalFileType* log_type) {
+ Slice rest(fname);
+ if (fname.length() > 1 && fname[0] == '/') {
+ rest.remove_prefix(1);
+ }
+ if (rest == "IDENTITY") {
+ *number = 0;
+ *type = kIdentityFile;
+ } else if (rest == "CURRENT") {
+ *number = 0;
+ *type = kCurrentFile;
+ } else if (rest == "LOCK") {
+ *number = 0;
+ *type = kDBLockFile;
+ } else if (info_log_name_prefix.size() > 0 &&
+ rest.starts_with(info_log_name_prefix)) {
+ rest.remove_prefix(info_log_name_prefix.size());
+ if (rest == "" || rest == ".old") {
+ *number = 0;
+ *type = kInfoLogFile;
+ } else if (rest.starts_with(".old.")) {
+ uint64_t ts_suffix;
+ // sizeof also counts the trailing '\0'.
+ rest.remove_prefix(sizeof(".old.") - 1);
+ if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+ return false;
+ }
+ *number = ts_suffix;
+ *type = kInfoLogFile;
+ }
+ } else if (rest.starts_with("MANIFEST-")) {
+ rest.remove_prefix(strlen("MANIFEST-"));
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ if (!rest.empty()) {
+ return false;
+ }
+ *type = kDescriptorFile;
+ *number = num;
+ } else if (rest.starts_with("METADB-")) {
+ rest.remove_prefix(strlen("METADB-"));
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ if (!rest.empty()) {
+ return false;
+ }
+ *type = kMetaDatabase;
+ *number = num;
+ } else if (rest.starts_with(kOptionsFileNamePrefix)) {
+ uint64_t ts_suffix;
+ bool is_temp_file = false;
+ rest.remove_prefix(kOptionsFileNamePrefix.size());
+ const std::string kTempFileNameSuffixWithDot =
+ std::string(".") + kTempFileNameSuffix;
+ if (rest.ends_with(kTempFileNameSuffixWithDot)) {
+ rest.remove_suffix(kTempFileNameSuffixWithDot.size());
+ is_temp_file = true;
+ }
+ if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+ return false;
+ }
+ *number = ts_suffix;
+ *type = is_temp_file ? kTempFile : kOptionsFile;
+ } else {
+ // Avoid strtoull() to keep filename format independent of the
+ // current locale
+ bool archive_dir_found = false;
+ if (rest.starts_with(kArchivalDirName)) {
+ if (rest.size() <= kArchivalDirName.size()) {
+ return false;
+ }
+ rest.remove_prefix(kArchivalDirName.size() +
+ 1); // Add 1 to remove / also
+ if (log_type) {
+ *log_type = kArchivedLogFile;
+ }
+ archive_dir_found = true;
+ }
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ if (rest.size() <= 1 || rest[0] != '.') {
+ return false;
+ }
+ rest.remove_prefix(1);
+
+ Slice suffix = rest;
+ if (suffix == Slice("log")) {
+ *type = kWalFile;
+ if (log_type && !archive_dir_found) {
+ *log_type = kAliveLogFile;
+ }
+ } else if (archive_dir_found) {
+ return false; // Archive dir can contain only log files
+ } else if (suffix == Slice(kRocksDbTFileExt) ||
+ suffix == Slice(kLevelDbTFileExt)) {
+ *type = kTableFile;
+ } else if (suffix == Slice(kRocksDBBlobFileExt)) {
+ *type = kBlobFile;
+ } else if (suffix == Slice(kTempFileNameSuffix)) {
+ *type = kTempFile;
+ } else {
+ return false;
+ }
+ *number = num;
+ }
+ return true;
+}
+
+IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
+ uint64_t descriptor_number,
+ FSDirectory* dir_contains_current_file) {
+ // Remove leading "dbname/" and add newline to manifest file name
+ std::string manifest = DescriptorFileName(dbname, descriptor_number);
+ Slice contents = manifest;
+ assert(contents.starts_with(dbname + "/"));
+ contents.remove_prefix(dbname.size() + 1);
+ std::string tmp = TempFileName(dbname, descriptor_number);
+ IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true);
+ TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s);
+ if (s.ok()) {
+ TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2);
+ s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr);
+ TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2);
+ TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s);
+ }
+ if (s.ok()) {
+ if (dir_contains_current_file != nullptr) {
+ s = dir_contains_current_file->FsyncWithDirOptions(
+ IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname)));
+ }
+ } else {
+ fs->DeleteFile(tmp, IOOptions(), nullptr)
+ .PermitUncheckedError(); // NOTE: PermitUncheckedError is acceptable
+ // here as we are already handling an error
+ // case, and this is just a best-attempt
+ // effort at some cleanup
+ }
+ return s;
+}
+
+Status SetIdentityFile(Env* env, const std::string& dbname,
+ const std::string& db_id) {
+ std::string id;
+ if (db_id.empty()) {
+ id = env->GenerateUniqueId();
+ } else {
+ id = db_id;
+ }
+ assert(!id.empty());
+ // Reserve the filename dbname/000000.dbtmp for the temporary identity file
+ std::string tmp = TempFileName(dbname, 0);
+  std::string identity_file_name = IdentityFileName(dbname);
+ Status s = WriteStringToFile(env, id, tmp, true);
+ if (s.ok()) {
+    s = env->RenameFile(tmp, identity_file_name);
+ }
+ std::unique_ptr<FSDirectory> dir_obj;
+ if (s.ok()) {
+ s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj,
+ nullptr);
+ }
+ if (s.ok()) {
+    s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
+                                     DirFsyncOptions(identity_file_name));
+ }
+
+ // The default Close() could return "NotSupported" and we bypass it
+  // if it is not implemented. Detailed explanations can be found in
+ // db/db_impl/db_impl.h
+ if (s.ok()) {
+ Status temp_s = dir_obj->Close(IOOptions(), nullptr);
+ if (!temp_s.ok()) {
+ if (temp_s.IsNotSupported()) {
+ temp_s.PermitUncheckedError();
+ } else {
+ s = temp_s;
+ }
+ }
+ }
+ if (!s.ok()) {
+ env->DeleteFile(tmp).PermitUncheckedError();
+ }
+ return s;
+}
+
+IOStatus SyncManifest(const ImmutableDBOptions* db_options,
+ WritableFileWriter* file) {
+ TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2);
+ StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS);
+ return file->Sync(db_options->use_fsync);
+}
+
+Status GetInfoLogFiles(const std::shared_ptr<FileSystem>& fs,
+ const std::string& db_log_dir, const std::string& dbname,
+ std::string* parent_dir,
+ std::vector<std::string>* info_log_list) {
+ assert(parent_dir != nullptr);
+ assert(info_log_list != nullptr);
+ uint64_t number = 0;
+ FileType type = kWalFile;
+
+ if (!db_log_dir.empty()) {
+ *parent_dir = db_log_dir;
+ } else {
+ *parent_dir = dbname;
+ }
+
+ InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname);
+
+ std::vector<std::string> file_names;
+ Status s = fs->GetChildren(*parent_dir, IOOptions(), &file_names, nullptr);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (auto& f : file_names) {
+ if (ParseFileName(f, &number, info_log_prefix.prefix, &type) &&
+ (type == kInfoLogFile)) {
+ info_log_list->push_back(f);
+ }
+ }
+ return Status::OK();
+}
+
+std::string NormalizePath(const std::string& path) {
+ std::string dst;
+
+ if (path.length() > 2 && path[0] == kFilePathSeparator &&
+ path[1] == kFilePathSeparator) { // Handle UNC names
+ dst.append(2, kFilePathSeparator);
+ }
+
+ for (auto c : path) {
+ if (!dst.empty() && (c == kFilePathSeparator || c == '/') &&
+ (dst.back() == kFilePathSeparator || dst.back() == '/')) {
+ continue;
+ }
+ dst.push_back(c);
+ }
+ return dst;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/filename.h b/src/rocksdb/file/filename.h
new file mode 100644
index 000000000..2eb125b6a
--- /dev/null
+++ b/src/rocksdb/file/filename.h
@@ -0,0 +1,188 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#pragma once
+#include <stdint.h>
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Directory;
+class SystemClock;
+class WritableFileWriter;
+
+#ifdef OS_WIN
+constexpr char kFilePathSeparator = '\\';
+#else
+constexpr char kFilePathSeparator = '/';
+#endif
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+extern std::string LogFileName(uint64_t number);
+
+extern std::string BlobFileName(uint64_t number);
+
+extern std::string BlobFileName(const std::string& bdirname, uint64_t number);
+
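+// Return the name of the blob file with the specified number, placed in
+// "blob_dir" under "dbname". For example, BlobFileName(dbname, "blob_dir", 5)
+// yields a name like dbname + "/blob_dir/000005.blob".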
+extern std::string BlobFileName(const std::string& dbname,
+ const std::string& blob_dir, uint64_t number);
+
+extern std::string ArchivalDirectory(const std::string& dbname);
+
+// Return the name of the archived log file with the specified number
+// in the db named by "dbname". The result will be prefixed with "dbname".
+extern std::string ArchivedLogFileName(const std::string& dbname, uint64_t num);
+
+extern std::string MakeTableFileName(const std::string& name, uint64_t number);
+
+extern std::string MakeTableFileName(uint64_t number);
+
+// Return the name of the sstable with the LevelDB suffix, derived from the
+// corresponding RocksDB-suffixed sstable name.
+extern std::string Rocks2LevelTableFileName(const std::string& fullname);
+
+// the reverse function of MakeTableFileName
+// TODO(yhchiang): could merge this function with ParseFileName()
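+// For example, TableFileNameToNumber("/db/000052.sst") returns 52.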
+extern uint64_t TableFileNameToNumber(const std::string& name);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::vector<DbPath>& db_paths,
+ uint64_t number, uint32_t path_id);
+
+// Sufficient buffer size for FormatFileNumber.
+const size_t kFormatFileNumberBufSize = 38;
+
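+// Format `number`, annotated with `path_id` when it is non-zero, into
+// `out_buf`. For example, FormatFileNumber(7, 0, buf, sizeof(buf)) writes "7",
+// while FormatFileNumber(7, 2, buf, sizeof(buf)) writes "7(path 2)".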
+extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+ size_t out_buf_size);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number. The result will be
+// prefixed with "dbname".
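+// For example, DescriptorFileName(dbname, 5) returns
+// dbname + "/MANIFEST-000005".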
+extern std::string DescriptorFileName(const std::string& dbname,
+ uint64_t number);
+
+extern std::string DescriptorFileName(uint64_t number);
+
+extern const std::string kCurrentFileName; // = "CURRENT"
+
+// Return the name of the current file. This file contains the name
+// of the current manifest file. The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname". The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// A helper structure for prefix of info log names.
+struct InfoLogPrefix {
+ char buf[260];
+ Slice prefix;
+ // Prefix with DB absolute path encoded
+ explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
+ // Default Prefix
+ explicit InfoLogPrefix();
+};
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+ const std::string& db_path = "",
+ const std::string& log_dir = "");
+
+// Return the name of the old info log file for "dbname".
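+// For example, when `log_dir` is empty, OldInfoLogFileName(dbname, 123)
+// returns dbname + "/LOG.old.123".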
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+ const std::string& db_path = "",
+ const std::string& log_dir = "");
+
+extern const std::string kOptionsFileNamePrefix; // = "OPTIONS-"
+extern const std::string kTempFileNameSuffix; // = "dbtmp"
+
+// Return an options file name given the "dbname" and file number.
+// Format: OPTIONS-[number]
+extern std::string OptionsFileName(const std::string& dbname,
+ uint64_t file_num);
+extern std::string OptionsFileName(uint64_t file_num);
+
+// Return a temp options file name given the "dbname" and file number.
+// Format: OPTIONS-[number].dbtmp
+extern std::string TempOptionsFileName(const std::string& dbname,
+ uint64_t file_num);
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname, uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh
+// either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number. If the
+// filename was successfully parsed, returns true. Else return false.
+// info_log_name_prefix is the path of info logs.
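+// For example, "000011.sst" yields *number == 11 and *type == kTableFile,
+// while "MANIFEST-000004" yields *number == 4 and *type == kDescriptorFile.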
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+ const Slice& info_log_name_prefix, FileType* type,
+ WalFileType* log_type = nullptr);
+// Same as previous function, but skip info log files.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+ FileType* type, WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number. On success, and when dir_contains_current_file is not
+// nullptr, the function will also fsync the directory containing the
+// CURRENT file.
+extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
+ uint64_t descriptor_number,
+ FSDirectory* dir_contains_current_file);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname,
+ const std::string& db_id = {});
+
+// Sync manifest file `file`.
+extern IOStatus SyncManifest(const ImmutableDBOptions* db_options,
+ WritableFileWriter* file);
+
+// Return the list of info log file names in `file_names`.
+// The list contains only file names; the parent directory name is stored
+// in `parent_dir`.
+// `db_log_dir` should be the same as options.db_log_dir.
+extern Status GetInfoLogFiles(const std::shared_ptr<FileSystem>& fs,
+ const std::string& db_log_dir,
+ const std::string& dbname,
+ std::string* parent_dir,
+ std::vector<std::string>* file_names);
+
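+// Collapse repeated path separators in `path`. For example,
+// NormalizePath("/a//b///c") returns "/a/b/c"; the leading double separator
+// of a UNC-style path is preserved.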
+extern std::string NormalizePath(const std::string& path);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/line_file_reader.cc b/src/rocksdb/file/line_file_reader.cc
new file mode 100644
index 000000000..50c415dc6
--- /dev/null
+++ b/src/rocksdb/file/line_file_reader.cc
@@ -0,0 +1,73 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "file/line_file_reader.h"
+
+#include <cstring>
+
+#include "monitoring/iostats_context_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus LineFileReader::Create(const std::shared_ptr<FileSystem>& fs,
+ const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<LineFileReader>* reader,
+ IODebugContext* dbg,
+ RateLimiter* rate_limiter) {
+ std::unique_ptr<FSSequentialFile> file;
+ IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg);
+ if (io_s.ok()) {
+ reader->reset(new LineFileReader(
+ std::move(file), fname, nullptr,
+ std::vector<std::shared_ptr<EventListener>>{}, rate_limiter));
+ }
+ return io_s;
+}
+
+bool LineFileReader::ReadLine(std::string* out,
+ Env::IOPriority rate_limiter_priority) {
+ assert(out);
+ if (!io_status_.ok()) {
+ // Status should be checked (or permit unchecked) any time we return false.
+ io_status_.MustCheck();
+ return false;
+ }
+ out->clear();
+ for (;;) {
+ // Look for line delimiter
+ const char* found = static_cast<const char*>(
+ std::memchr(buf_begin_, '\n', buf_end_ - buf_begin_));
+ if (found) {
+ size_t len = found - buf_begin_;
+ out->append(buf_begin_, len);
+ buf_begin_ += len + /*delim*/ 1;
+ ++line_number_;
+ return true;
+ }
+ if (at_eof_) {
+ io_status_.MustCheck();
+ return false;
+ }
+ // else flush and reload buffer
+ out->append(buf_begin_, buf_end_ - buf_begin_);
+ Slice result;
+ io_status_ =
+ sfr_.Read(buf_.size(), &result, buf_.data(), rate_limiter_priority);
+ IOSTATS_ADD(bytes_read, result.size());
+ if (!io_status_.ok()) {
+ io_status_.MustCheck();
+ return false;
+ }
+ if (result.size() != buf_.size()) {
+ // The obscure way of indicating EOF
+ at_eof_ = true;
+ }
+ buf_begin_ = result.data();
+ buf_end_ = result.data() + result.size();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/line_file_reader.h b/src/rocksdb/file/line_file_reader.h
new file mode 100644
index 000000000..cc302d311
--- /dev/null
+++ b/src/rocksdb/file/line_file_reader.h
@@ -0,0 +1,60 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <array>
+
+#include "file/sequence_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper on top of Env::SequentialFile for reading text lines from a file.
+// Lines are delimited by '\n'. The last line may or may not include a
+// trailing newline. Uses SequentialFileReader internally.
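+//
+// Typical usage (a minimal sketch, assuming `fs` is an already-created
+// std::shared_ptr<FileSystem>; the file name is illustrative only):
+//
+//   std::unique_ptr<LineFileReader> reader;
+//   IOStatus io_s =
+//       LineFileReader::Create(fs, "/path/to/file.txt", FileOptions(),
+//                              &reader, /*dbg=*/nullptr,
+//                              /*rate_limiter=*/nullptr);
+//   std::string line;
+//   while (io_s.ok() && reader->ReadLine(&line, Env::IO_TOTAL)) {
+//     // Use `line`; the '\n' delimiter has already been stripped.
+//   }
+//   if (io_s.ok()) {
+//     // Distinguish end-of-file (OK) from an I/O error.
+//     io_s = reader->GetStatus();
+//   }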
+class LineFileReader {
+ private:
+ std::array<char, 8192> buf_;
+ SequentialFileReader sfr_;
+ IOStatus io_status_;
+ const char* buf_begin_ = buf_.data();
+ const char* buf_end_ = buf_.data();
+ size_t line_number_ = 0;
+ bool at_eof_ = false;
+
+ public:
+ // See SequentialFileReader constructors
+ template <typename... Args>
+ explicit LineFileReader(Args&&... args)
+ : sfr_(std::forward<Args&&>(args)...) {}
+
+ static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<LineFileReader>* reader,
+ IODebugContext* dbg, RateLimiter* rate_limiter);
+
+ LineFileReader(const LineFileReader&) = delete;
+ LineFileReader& operator=(const LineFileReader&) = delete;
+
+ // Reads another line from the file, returning true on success and saving
+ // the line to `out`, without delimiter, or returning false on failure. You
+ // must check GetStatus() to determine whether the failure was just
+ // end-of-file (OK status) or an I/O error (another status).
+ // The internal rate limiter will be charged at the specified priority.
+ bool ReadLine(std::string* out, Env::IOPriority rate_limiter_priority);
+
+ // Returns the number of the line most recently returned from ReadLine.
+ // Return value is unspecified if ReadLine has returned false due to
+ // I/O error. After ReadLine returns false due to end-of-file, return
+ // value is the last returned line number, or equivalently the total
+ // number of lines returned.
+ size_t GetLineNumber() const { return line_number_; }
+
+ // Returns any error encountered during read. The error is considered
+ // permanent and no retry or recovery is attempted with the same
+ // LineFileReader.
+ const IOStatus& GetStatus() const { return io_status_; }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/prefetch_test.cc b/src/rocksdb/file/prefetch_test.cc
new file mode 100644
index 000000000..438286bfc
--- /dev/null
+++ b/src/rocksdb/file/prefetch_test.cc
@@ -0,0 +1,2109 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "file/file_prefetch_buffer.h"
+#include "file/file_util.h"
+#include "rocksdb/file_system.h"
+#include "test_util/sync_point.h"
+#ifdef GFLAGS
+#include "tools/io_tracer_parser_tool.h"
+#endif
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MockFS;
+
+class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+ MockRandomAccessFile(std::unique_ptr<FSRandomAccessFile>& file,
+ bool support_prefetch, std::atomic_int& prefetch_count)
+ : FSRandomAccessFileOwnerWrapper(std::move(file)),
+ support_prefetch_(support_prefetch),
+ prefetch_count_(prefetch_count) {}
+
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override {
+ if (support_prefetch_) {
+ prefetch_count_.fetch_add(1);
+ return target()->Prefetch(offset, n, options, dbg);
+ } else {
+ return IOStatus::NotSupported("Prefetch not supported");
+ }
+ }
+
+ private:
+ const bool support_prefetch_;
+ std::atomic_int& prefetch_count_;
+};
+
+class MockFS : public FileSystemWrapper {
+ public:
+ explicit MockFS(const std::shared_ptr<FileSystem>& wrapped,
+ bool support_prefetch)
+ : FileSystemWrapper(wrapped), support_prefetch_(support_prefetch) {}
+
+ static const char* kClassName() { return "MockFS"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ std::unique_ptr<FSRandomAccessFile> file;
+ IOStatus s;
+ s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+ result->reset(
+ new MockRandomAccessFile(file, support_prefetch_, prefetch_count_));
+ return s;
+ }
+
+ void ClearPrefetchCount() { prefetch_count_ = 0; }
+
+ bool IsPrefetchCalled() { return prefetch_count_ > 0; }
+
+ int GetPrefetchCount() {
+ return prefetch_count_.load(std::memory_order_relaxed);
+ }
+
+ private:
+ const bool support_prefetch_;
+ std::atomic_int prefetch_count_{0};
+};
+
+class PrefetchTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ PrefetchTest() : DBTestBase("prefetch_test", true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest,
+ ::testing::Combine(::testing::Bool(),
+ ::testing::Bool()));
+
+std::string BuildKey(int num, std::string postfix = "") {
+ return "my_key_" + std::to_string(num) + postfix;
+}
+
+TEST_P(PrefetchTest, Basic) {
+  // The first param indicates whether the MockFS supports prefetch.
+ bool support_prefetch =
+ std::get<0>(GetParam()) &&
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  // The second param indicates whether directIO is enabled.
+ bool use_direct_io = std::get<1>(GetParam());
+ const int kNumKeys = 1100;
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+
+ int buff_prefetch_count = 0;
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ // create first key range
+ WriteBatch batch;
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), "value for range 1 key"));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ // create second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i, "key2"), "value for range 2 key"));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ // delete second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Delete(BuildKey(i, "key2")));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ // compact database
+ std::string start_key = BuildKey(0);
+ std::string end_key = BuildKey(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ // commenting out the line below causes the example to work correctly
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ if (support_prefetch && !use_direct_io) {
+    // If the underlying file system supports prefetch and directIO is not
+    // enabled, make sure prefetch() is called and FilePrefetchBuffer is not
+    // used.
+ ASSERT_TRUE(fs->IsPrefetchCalled());
+ fs->ClearPrefetchCount();
+ ASSERT_EQ(0, buff_prefetch_count);
+ } else {
+    // If the underlying file system doesn't support prefetch, or directIO is
+    // enabled, make sure prefetch() is not called and FilePrefetchBuffer is
+    // used.
+ ASSERT_FALSE(fs->IsPrefetchCalled());
+ ASSERT_GT(buff_prefetch_count, 0);
+ buff_prefetch_count = 0;
+ }
+
+ // count the keys
+ {
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ int num_keys = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ }
+
+  // Make sure prefetch is called only if the file system supports prefetch.
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_TRUE(fs->IsPrefetchCalled());
+ fs->ClearPrefetchCount();
+ ASSERT_EQ(0, buff_prefetch_count);
+ } else {
+ ASSERT_FALSE(fs->IsPrefetchCalled());
+ ASSERT_GT(buff_prefetch_count, 0);
+ buff_prefetch_count = 0;
+ }
+ Close();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
+  // The first param indicates whether the MockFS supports prefetch.
+ bool support_prefetch =
+ std::get<0>(GetParam()) &&
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  // The second param indicates whether directIO is enabled.
+ bool use_direct_io = std::get<1>(GetParam());
+
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.disable_auto_compactions = true;
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.max_auto_readahead_size = 0;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ int buff_prefetch_count = 0;
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+
+ // DB open will create table readers unless we reduce the table cache
+ // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table
+ // cache is allocated with max_open_files - 10 as capacity. So override
+  // max_open_files to 11 so table cache capacity will become 1. This will
+ // prevent file open during DB open and force the file to be opened during
+ // Iteration.
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = TryReopen(options);
+
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ Random rnd(309);
+ int key_count = 0;
+ const int num_keys_per_level = 100;
+ // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
+ for (int level = 2; level >= 0; level--) {
+ key_count = level * num_keys_per_level;
+ for (int i = 0; i < num_keys_per_level; ++i) {
+ ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(level);
+ }
+ Close();
+  std::vector<int> buff_prefetch_level_count = {0, 0, 0};
+ TryReopen(options);
+ {
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ fs->ClearPrefetchCount();
+ buff_prefetch_count = 0;
+
+ for (int level = 2; level >= 0; level--) {
+ key_count = level * num_keys_per_level;
+ switch (level) {
+ case 0:
+          // max_auto_readahead_size is set to 0 so data and index blocks are
+          // not prefetched.
+ ASSERT_OK(db_->SetOptions(
+ {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}}));
+ break;
+ case 1:
+ // max_auto_readahead_size is set less than
+ // initial_auto_readahead_size. So readahead_size remains equal to
+ // max_auto_readahead_size.
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+ "{max_auto_readahead_size=4096;}"}}));
+ break;
+ case 2:
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+ "{max_auto_readahead_size=65536;}"}}));
+ break;
+ default:
+ assert(false);
+ }
+
+ for (int i = 0; i < num_keys_per_level; ++i) {
+ iter->Seek(Key(key_count++));
+ iter->Next();
+ }
+
+      buff_prefetch_level_count[level] = buff_prefetch_count;
+ if (support_prefetch && !use_direct_io) {
+ if (level == 0) {
+ ASSERT_FALSE(fs->IsPrefetchCalled());
+ } else {
+ ASSERT_TRUE(fs->IsPrefetchCalled());
+ }
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_FALSE(fs->IsPrefetchCalled());
+ if (level == 0) {
+ ASSERT_EQ(buff_prefetch_count, 0);
+ } else {
+ ASSERT_GT(buff_prefetch_count, 0);
+ }
+ buff_prefetch_count = 0;
+ }
+ }
+ }
+
+ if (!support_prefetch) {
+    ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) {
+  // The first param indicates whether the MockFS supports prefetch.
+ bool support_prefetch =
+ std::get<0>(GetParam()) &&
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  // The second param indicates whether directIO is enabled.
+ bool use_direct_io = std::get<1>(GetParam());
+
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.disable_auto_compactions = true;
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.initial_auto_readahead_size = 0;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ int buff_prefetch_count = 0;
+ // DB open will create table readers unless we reduce the table cache
+ // capacity. SanitizeOptions will set max_open_files to minimum of 20.
+ // Table cache is allocated with max_open_files - 10 as capacity. So
+  // override max_open_files to 11 so table cache capacity will become 1.
+ // This will prevent file open during DB open and force the file to be
+ // opened during Iteration.
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = TryReopen(options);
+
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ Random rnd(309);
+ int key_count = 0;
+ const int num_keys_per_level = 100;
+ // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
+ for (int level = 2; level >= 0; level--) {
+ key_count = level * num_keys_per_level;
+ for (int i = 0; i < num_keys_per_level; ++i) {
+ ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(level);
+ }
+ Close();
+
+ TryReopen(options);
+ {
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ fs->ClearPrefetchCount();
+ buff_prefetch_count = 0;
+ std::vector<int> buff_prefetch_level_count = {0, 0, 0};
+
+ for (int level = 2; level >= 0; level--) {
+ key_count = level * num_keys_per_level;
+ switch (level) {
+ case 0:
+          // initial_auto_readahead_size is set to 0 so data and index blocks
+          // are not prefetched.
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+ "{initial_auto_readahead_size=0;}"}}));
+ break;
+ case 1:
+          // initial_auto_readahead_size and max_auto_readahead_size are set
+          // to the same value, so readahead_size remains the same.
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+ "{initial_auto_readahead_size=4096;max_"
+ "auto_readahead_size=4096;}"}}));
+ break;
+ case 2:
+ ASSERT_OK(
+ db_->SetOptions({{"block_based_table_factory",
+ "{initial_auto_readahead_size=65536;}"}}));
+ break;
+ default:
+ assert(false);
+ }
+
+ for (int i = 0; i < num_keys_per_level; ++i) {
+ iter->Seek(Key(key_count++));
+ iter->Next();
+ }
+
+ buff_prefetch_level_count[level] = buff_prefetch_count;
+ if (support_prefetch && !use_direct_io) {
+ if (level == 0) {
+ ASSERT_FALSE(fs->IsPrefetchCalled());
+ } else {
+ ASSERT_TRUE(fs->IsPrefetchCalled());
+ }
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_FALSE(fs->IsPrefetchCalled());
+ if (level == 0) {
+ ASSERT_EQ(buff_prefetch_count, 0);
+ } else {
+ ASSERT_GT(buff_prefetch_count, 0);
+ }
+ buff_prefetch_count = 0;
+ }
+ }
+ if (!support_prefetch) {
+ ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
+ }
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) {
+  // The first param indicates whether the MockFS supports prefetch.
+ bool support_prefetch =
+ std::get<0>(GetParam()) &&
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+ const int kNumKeys = 2000;
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  // The second param indicates whether directIO is enabled.
+ bool use_direct_io = std::get<1>(GetParam());
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.num_file_reads_for_auto_readahead = 0;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+
+ int buff_prefetch_count = 0;
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ WriteBatch batch;
+ Random rnd(309);
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ std::string start_key = BuildKey(0);
+ std::string end_key = BuildKey(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ Close();
+ TryReopen(options);
+
+ fs->ClearPrefetchCount();
+ buff_prefetch_count = 0;
+
+ {
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ /*
+     * Reseek keys from sequential data blocks within the same partitioned
+     * index. It will prefetch the data block at the first seek since
+     * num_file_reads_for_auto_readahead = 0. Data block size is nearly 4076
+     * bytes, so readahead will initially fetch 8 * 1024 more bytes (2 more
+     * data blocks).
+ */
+ iter->Seek(BuildKey(0)); // Prefetch data + index block since
+ // num_file_reads_for_auto_readahead = 0.
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1000)); // In buffer
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1004)); // In buffer
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1008)); // Prefetch Data
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1011)); // In buffer
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1015)); // In buffer
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1019)); // In buffer
+ ASSERT_TRUE(iter->Valid());
+ // Missed 2 blocks but they are already in buffer so no reset.
+ iter->Seek(BuildKey(103)); // Already in buffer.
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1033)); // Prefetch Data.
+ ASSERT_TRUE(iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 4);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 4);
+ buff_prefetch_count = 0;
+ }
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(PrefetchTest, PrefetchWhenReseek) {
+  // The first param indicates whether the MockFS supports prefetch.
+ bool support_prefetch =
+ std::get<0>(GetParam()) &&
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+ const int kNumKeys = 2000;
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  // The second param indicates whether directIO is enabled.
+ bool use_direct_io = std::get<1>(GetParam());
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+
+ int buff_prefetch_count = 0;
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ WriteBatch batch;
+ Random rnd(309);
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ std::string start_key = BuildKey(0);
+ std::string end_key = BuildKey(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ fs->ClearPrefetchCount();
+ buff_prefetch_count = 0;
+
+ {
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ /*
+     * Reseek keys from sequential data blocks within the same partitioned
+     * index. After 2 sequential reads it will prefetch the data block.
+     * Data block size is nearly 4076 bytes, so readahead will initially fetch
+     * 8 * 1024 more bytes (2 more data blocks).
+ */
+ iter->Seek(BuildKey(0));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1000));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1004)); // Prefetch Data
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1008));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1011));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1015)); // Prefetch Data
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1019));
+ ASSERT_TRUE(iter->Valid());
+ // Missed 2 blocks but they are already in buffer so no reset.
+ iter->Seek(BuildKey(103)); // Already in buffer.
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1033)); // Prefetch Data
+ ASSERT_TRUE(iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 3);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 3);
+ buff_prefetch_count = 0;
+ }
+ }
+ {
+ /*
+     * Reseek keys from non-sequential data blocks within the same partitioned
+ * index. buff_prefetch_count will be 0 in that case.
+ */
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ iter->Seek(BuildKey(0));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1008));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1019));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1033));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1048));
+ ASSERT_TRUE(iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 0);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 0);
+ buff_prefetch_count = 0;
+ }
+ }
+ {
+ /*
+     * Reseek keys from a single data block.
+ */
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ iter->Seek(BuildKey(0));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(10));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(100));
+ ASSERT_TRUE(iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 0);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 0);
+ buff_prefetch_count = 0;
+ }
+ }
+ {
+ /*
+ * Reseek keys from sequential data blocks to set implicit auto readahead
+     * and prefetch data, but after that iterate over different
+     * (non-sequential) data blocks, which won't prefetch any further data.
+     * So buff_prefetch_count will be 1 for the first prefetch only.
+ */
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ iter->Seek(BuildKey(0));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1000));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1004)); // This iteration will prefetch buffer
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1008));
+ ASSERT_TRUE(iter->Valid());
+    iter->Seek(
+        BuildKey(996));  // Reseek won't prefetch any data and
+                         // readahead_size will be initialized to 8*1024.
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(992));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(989));
+ ASSERT_TRUE(iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 1);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 1);
+ buff_prefetch_count = 0;
+ }
+
+ // Read sequentially to confirm readahead_size is reset to initial value (2
+ // more data blocks)
+ iter->Seek(BuildKey(1011));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1015));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1019)); // Prefetch Data
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1022));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1026));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(103)); // Prefetch Data
+ ASSERT_TRUE(iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 2);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 2);
+ buff_prefetch_count = 0;
+ }
+ }
+ {
+    /* Reseek keys from sequential partitioned index blocks. Since partitioned
+     * index fetches are sequential, buff_prefetch_count will be 1.
+ */
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ iter->Seek(BuildKey(0));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1167));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1334)); // This iteration will prefetch buffer
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1499));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1667));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1847));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1999));
+ ASSERT_TRUE(iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 1);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 1);
+ buff_prefetch_count = 0;
+ }
+ }
+ {
+ /*
+     * Reseek over different keys from different blocks. buff_prefetch_count
+     * remains 0.
+ */
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ int i = 0;
+ int j = 1000;
+ do {
+ iter->Seek(BuildKey(i));
+ if (!iter->Valid()) {
+ break;
+ }
+ i = i + 100;
+ iter->Seek(BuildKey(j));
+ j = j + 100;
+ } while (i < 1000 && j < kNumKeys && iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 0);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 0);
+ buff_prefetch_count = 0;
+ }
+ }
+ {
+ /* Iterates sequentially over all keys. It will prefetch the buffer.*/
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ }
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 13);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 13);
+ buff_prefetch_count = 0;
+ }
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) {
+  // The first param indicates whether the MockFS supports prefetch.
+ bool support_prefetch =
+ std::get<0>(GetParam()) &&
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+ const int kNumKeys = 2000;
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  // The second param indicates whether directIO is enabled.
+ bool use_direct_io = std::get<1>(GetParam());
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+
+ BlockBasedTableOptions table_options;
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);  // 4MB
+ table_options.block_cache = cache;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+
+ int buff_prefetch_count = 0;
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ WriteBatch batch;
+ Random rnd(309);
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ std::string start_key = BuildKey(0);
+ std::string end_key = BuildKey(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ fs->ClearPrefetchCount();
+ buff_prefetch_count = 0;
+
+ {
+ /*
+     * Reseek keys from sequential data blocks within the same partitioned
+     * index. After 2 sequential reads it will prefetch the data block.
+     * Data block size is nearly 4076 bytes, so readahead will initially fetch
+     * 8 * 1024 more bytes (2 more data blocks).
+ */
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ // Warm up the cache
+ iter->Seek(BuildKey(1011));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1015));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1019));
+ ASSERT_TRUE(iter->Valid());
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 1);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 1);
+ buff_prefetch_count = 0;
+ }
+ }
+ {
+ // After caching, blocks will be read from cache (Sequential blocks)
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ iter->Seek(BuildKey(0));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1000));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1004)); // Prefetch data (not in cache).
+ ASSERT_TRUE(iter->Valid());
+    // Missed one sequential block, but the next one is already in the buffer
+    // so readahead will not be reset.
+ iter->Seek(BuildKey(1011));
+ ASSERT_TRUE(iter->Valid());
+ // Prefetch data but blocks are in cache so no prefetch and reset.
+ iter->Seek(BuildKey(1015));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1019));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1022));
+ ASSERT_TRUE(iter->Valid());
+ // Prefetch data with readahead_size = 4 blocks.
+ iter->Seek(BuildKey(1026));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(103));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1033));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1037));
+ ASSERT_TRUE(iter->Valid());
+
+ if (support_prefetch && !use_direct_io) {
+ ASSERT_EQ(fs->GetPrefetchCount(), 3);
+ fs->ClearPrefetchCount();
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 2);
+ buff_prefetch_count = 0;
+ }
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(PrefetchTest, DBIterLevelReadAhead) {
+ const int kNumKeys = 1000;
+ // Set options
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ bool use_direct_io = std::get<0>(GetParam());
+ bool is_adaptive_readahead = std::get<1>(GetParam());
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.statistics = CreateDBStatistics();
+ options.env = env.get();
+
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ WriteBatch batch;
+ Random rnd(309);
+ int total_keys = 0;
+ for (int j = 0; j < 5; j++) {
+ for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ total_keys++;
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+ int buff_prefetch_count = 0;
+ int buff_async_prefetch_count = 0;
+ int readahead_carry_over_count = 0;
+ int num_sst_files = NumTableFilesAtLevel(2);
+ size_t current_readahead_size = 0;
+
+ // Test - Iterate over the keys sequentially.
+ {
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+ [&](void*) { buff_async_prefetch_count++; });
+
+    // Since reads are sequential, the callback checks that readahead_size
+    // doesn't restart from 8KB when the iterator moves to the next file; it is
+    // called num_sst_files - 1 times (i.e., for every file except the first).
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockPrefetcher::SetReadaheadState", [&](void* arg) {
+ readahead_carry_over_count++;
+ size_t readahead_size = *reinterpret_cast<size_t*>(arg);
+ if (readahead_carry_over_count) {
+ ASSERT_GT(readahead_size, 8 * 1024);
+ }
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) {
+ current_readahead_size = *reinterpret_cast<size_t*>(arg);
+ ASSERT_GT(current_readahead_size, 0);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ReadOptions ro;
+ if (is_adaptive_readahead) {
+ ro.adaptive_readahead = true;
+ ro.async_io = true;
+ }
+
+ ASSERT_OK(options.statistics->Reset());
+
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+ int num_keys = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ num_keys++;
+ }
+ ASSERT_EQ(num_keys, total_keys);
+
+ // For index and data blocks.
+ if (is_adaptive_readahead) {
+ ASSERT_EQ(readahead_carry_over_count, 2 * (num_sst_files - 1));
+ ASSERT_GT(buff_async_prefetch_count, 0);
+ } else {
+ ASSERT_GT(buff_prefetch_count, 0);
+ ASSERT_EQ(readahead_carry_over_count, 0);
+ }
+
+ // Check stats to make sure async prefetch is done.
+ {
+ HistogramData async_read_bytes;
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+ if (ro.async_io) {
+ ASSERT_GT(async_read_bytes.count, 0);
+ } else {
+ ASSERT_EQ(async_read_bytes.count, 0);
+ }
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+ Close();
+}
+#endif //! ROCKSDB_LITE
+
+class PrefetchTest1 : public DBTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ PrefetchTest1() : DBTestBase("prefetch_test1", true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(PrefetchTest1, PrefetchTest1, ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) {
+ const int kNumKeys = 1000;
+ // Set options
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ if (GetParam()) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Status s = TryReopen(options);
+ if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ WriteBatch batch;
+ Random rnd(309);
+ for (int j = 0; j < 5; j++) {
+ for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+
+ int buff_prefetch_count = 0;
+ int set_readahead = 0;
+ size_t readahead_size = 0;
+
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockPrefetcher::SetReadaheadState",
+ [&](void* /*arg*/) { set_readahead++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::TryReadFromCache",
+ [&](void* arg) { readahead_size = *reinterpret_cast<size_t*>(arg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ {
+ // Iterate until prefetch is done.
+ ReadOptions ro;
+ ro.adaptive_readahead = true;
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+
+ while (iter->Valid() && buff_prefetch_count == 0) {
+ iter->Next();
+ }
+
+ ASSERT_EQ(readahead_size, 8 * 1024);
+ ASSERT_EQ(buff_prefetch_count, 1);
+ ASSERT_EQ(set_readahead, 0);
+ buff_prefetch_count = 0;
+
+    // Move to the last file and check that the readahead size falls back to
+    // 8KB, so the next readahead size after the prefetch should be 8 * 1024.
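+    // Keys were written above in 5 flushed batches of kNumKeys (1000) each,
+    // so key 4004 is expected to land in the last SST file (an assumption of
+    // this test's setup, not verified here).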
+ iter->Seek(BuildKey(4004));
+ ASSERT_TRUE(iter->Valid());
+
+ while (iter->Valid() && buff_prefetch_count == 0) {
+ iter->Next();
+ }
+
+ ASSERT_EQ(readahead_size, 8 * 1024);
+ ASSERT_EQ(set_readahead, 0);
+ ASSERT_EQ(buff_prefetch_count, 1);
+ }
+ Close();
+}
+#endif //! ROCKSDB_LITE
+
+TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) {
+ const int kNumKeys = 2000;
+ // Set options
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ if (GetParam()) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB
+ table_options.block_cache = cache;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Status s = TryReopen(options);
+ if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ WriteBatch batch;
+ Random rnd(309);
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ std::string start_key = BuildKey(0);
+ std::string end_key = BuildKey(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ int buff_prefetch_count = 0;
+ size_t current_readahead_size = 0;
+ size_t expected_current_readahead_size = 8 * 1024;
+ size_t decrease_readahead_size = 8 * 1024;
+
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+ [&](void*) { buff_prefetch_count++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) {
+ current_readahead_size = *reinterpret_cast<size_t*>(arg);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ReadOptions ro;
+ ro.adaptive_readahead = true;
+ {
+    /*
+     * Reseek keys from sequential data blocks within the same partitioned
+     * index. After 2 sequential reads it will prefetch the data block.
+     * The data block size is roughly 4076 bytes, so the initial readahead
+     * will fetch 8 * 1024 additional bytes (about 2 more data blocks).
+     */
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+ // Warm up the cache
+ iter->Seek(BuildKey(1011));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1015));
+ ASSERT_TRUE(iter->Valid());
+ iter->Seek(BuildKey(1019));
+ ASSERT_TRUE(iter->Valid());
+ buff_prefetch_count = 0;
+ }
+
+ {
+ ASSERT_OK(options.statistics->Reset());
+ // After caching, blocks will be read from cache (Sequential blocks)
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+ iter->Seek(
+ BuildKey(0)); // In cache so it will decrease the readahead_size.
+ ASSERT_TRUE(iter->Valid());
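+    // Illustrative arithmetic (not from the original test comments): the
+    // expected size starts at 8 * 1024 and decrease_readahead_size is also
+    // 8 * 1024, so the std::max below clamps the result back to the 8KB floor.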
+ expected_current_readahead_size = std::max(
+ decrease_readahead_size,
+ (expected_current_readahead_size >= decrease_readahead_size
+ ? (expected_current_readahead_size - decrease_readahead_size)
+ : 0));
+
+ iter->Seek(BuildKey(1000)); // Won't prefetch the block.
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
+
+ iter->Seek(BuildKey(1004)); // Prefetch the block.
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
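+    // The test expects the readahead size to double after this prefetch
+    // (8KB -> 16KB), hence the multiplication below.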
+ expected_current_readahead_size *= 2;
+
+ iter->Seek(BuildKey(1011));
+ ASSERT_TRUE(iter->Valid());
+
+    // Eligible to prefetch data (not in the buffer), but the block is in the
+    // cache, so no prefetch happens and readahead_size is decreased back to
+    // 8 * 1024.
+ iter->Seek(BuildKey(1015));
+ ASSERT_TRUE(iter->Valid());
+ expected_current_readahead_size = std::max(
+ decrease_readahead_size,
+ (expected_current_readahead_size >= decrease_readahead_size
+ ? (expected_current_readahead_size - decrease_readahead_size)
+ : 0));
+
+ // 1016 is the same block as 1015. So no change in readahead_size.
+ iter->Seek(BuildKey(1016));
+ ASSERT_TRUE(iter->Valid());
+
+    // Data to prefetch is not in the buffer but is found in the cache, so
+    // readahead_size is decreased. Since it would become 0 after decrementing,
+    // readahead_size is reset to its initial value.
+ iter->Seek(BuildKey(1019));
+ ASSERT_TRUE(iter->Valid());
+ expected_current_readahead_size = std::max(
+ decrease_readahead_size,
+ (expected_current_readahead_size >= decrease_readahead_size
+ ? (expected_current_readahead_size - decrease_readahead_size)
+ : 0));
+
+ // Prefetch next sequential data.
+ iter->Seek(BuildKey(1022));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
+ ASSERT_EQ(buff_prefetch_count, 2);
+
+ buff_prefetch_count = 0;
+ }
+ Close();
+}
+
+TEST_P(PrefetchTest1, SeekParallelizationTest) {
+ const int kNumKeys = 2000;
+ // Set options
+ std::shared_ptr<MockFS> fs =
+ std::make_shared<MockFS>(env_->GetFileSystem(), false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ if (GetParam()) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Status s = TryReopen(options);
+ if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ WriteBatch batch;
+ Random rnd(309);
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ std::string start_key = BuildKey(0);
+ std::string end_key = BuildKey(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ int buff_prefetch_count = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+ [&](void*) { buff_prefetch_count++; });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ReadOptions ro;
+ ro.adaptive_readahead = true;
+ ro.async_io = true;
+
+ {
+ ASSERT_OK(options.statistics->Reset());
+ // Each block contains around 4 keys.
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+ iter->Seek(BuildKey(0)); // Prefetch data because of seek parallelization.
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+
+    // New data block. Since num_file_reads in FilePrefetchBuffer is 2 after
+    // this read, it will not prefetch yet.
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+
+ // Prefetch data.
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_EQ(buff_prefetch_count, 2);
+
+ // Check stats to make sure async prefetch is done.
+ {
+ HistogramData async_read_bytes;
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+ ASSERT_GT(async_read_bytes.count, 0);
+ ASSERT_GT(get_perf_context()->number_async_seek, 0);
+ }
+
+ buff_prefetch_count = 0;
+ }
+ Close();
+}
+
+extern "C" bool RocksDbIOUringEnable() { return true; }
+
+namespace {
+#ifndef ROCKSDB_LITE
+#ifdef GFLAGS
+const int kMaxArgCount = 100;
+const size_t kArgBufferSize = 100000;
+
+void RunIOTracerParserTool(std::string trace_file) {
+ std::vector<std::string> params = {"./io_tracer_parser",
+ "-io_trace_file=" + trace_file};
+
+ char arg_buffer[kArgBufferSize];
+ char* argv[kMaxArgCount];
+ int argc = 0;
+ int cursor = 0;
+ for (const auto& arg : params) {
+ ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
+ ASSERT_LE(argc + 1, kMaxArgCount);
+
+ snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
+
+ argv[argc++] = arg_buffer + cursor;
+ cursor += static_cast<int>(arg.size()) + 1;
+ }
+ ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv));
+}
+#endif // GFLAGS
+#endif // ROCKSDB_LITE
+} // namespace
+
+// Tests the default implementation of ReadAsync API with PosixFileSystem.
+TEST_P(PrefetchTest, ReadAsyncWithPosixFS) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+
+ const int kNumKeys = 1000;
+ std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+ FileSystem::Default(), /*support_prefetch=*/false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ bool use_direct_io = std::get<0>(GetParam());
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.statistics = CreateDBStatistics();
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ int total_keys = 0;
+ // Write the keys.
+ {
+ WriteBatch batch;
+ Random rnd(309);
+ for (int j = 0; j < 5; j++) {
+ for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ total_keys++;
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+ }
+
+ int buff_prefetch_count = 0;
+ bool read_async_called = false;
+ ReadOptions ro;
+ ro.adaptive_readahead = true;
+ ro.async_io = true;
+
+ if (std::get<1>(GetParam())) {
+ ro.readahead_size = 16 * 1024;
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+ [&](void*) { buff_prefetch_count++; });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "UpdateResults::io_uring_result",
+ [&](void* /*arg*/) { read_async_called = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Read the keys.
+ {
+ ASSERT_OK(options.statistics->Reset());
+ get_perf_context()->Reset();
+
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+ int num_keys = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ num_keys++;
+ }
+
+ ASSERT_EQ(num_keys, total_keys);
+ ASSERT_GT(buff_prefetch_count, 0);
+
+ // Check stats to make sure async prefetch is done.
+ {
+ HistogramData async_read_bytes;
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+ HistogramData prefetched_bytes_discarded;
+ options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
+ &prefetched_bytes_discarded);
+
+ // Not all platforms support iouring. In that case, ReadAsync in posix
+ // won't submit async requests.
+ if (read_async_called) {
+ ASSERT_GT(async_read_bytes.count, 0);
+ } else {
+ ASSERT_EQ(async_read_bytes.count, 0);
+ }
+ ASSERT_GT(prefetched_bytes_discarded.count, 0);
+ }
+ ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Close();
+}
+
+TEST_P(PrefetchTest, MultipleSeekWithPosixFS) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+
+ const int kNumKeys = 1000;
+ std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+ FileSystem::Default(), /*support_prefetch=*/false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ bool use_direct_io = std::get<0>(GetParam());
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.statistics = CreateDBStatistics();
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ int total_keys = 0;
+ // Write the keys.
+ {
+ WriteBatch batch;
+ Random rnd(309);
+ for (int j = 0; j < 5; j++) {
+ for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ total_keys++;
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+ }
+
+ int num_keys_first_batch = 0;
+ int num_keys_second_batch = 0;
+ // Calculate number of keys without async_io for correctness validation.
+ {
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+ // First Seek.
+ iter->Seek(BuildKey(450));
+ while (iter->Valid() && num_keys_first_batch < 100) {
+ ASSERT_OK(iter->status());
+ num_keys_first_batch++;
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+
+ iter->Seek(BuildKey(942));
+ while (iter->Valid()) {
+ ASSERT_OK(iter->status());
+ num_keys_second_batch++;
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ }
+
+ int buff_prefetch_count = 0;
+ bool read_async_called = false;
+ ReadOptions ro;
+ ro.adaptive_readahead = true;
+ ro.async_io = true;
+
+ if (std::get<1>(GetParam())) {
+ ro.readahead_size = 16 * 1024;
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+ [&](void*) { buff_prefetch_count++; });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "UpdateResults::io_uring_result",
+ [&](void* /*arg*/) { read_async_called = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Read the keys using seek.
+ {
+ ASSERT_OK(options.statistics->Reset());
+ get_perf_context()->Reset();
+
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+ int num_keys = 0;
+ // First Seek.
+ {
+ iter->Seek(BuildKey(450));
+ while (iter->Valid() && num_keys < 100) {
+ ASSERT_OK(iter->status());
+ num_keys++;
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(num_keys, num_keys_first_batch);
+ // Check stats to make sure async prefetch is done.
+ {
+ HistogramData async_read_bytes;
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+
+ // Not all platforms support iouring. In that case, ReadAsync in posix
+ // won't submit async requests.
+ if (read_async_called) {
+ ASSERT_GT(async_read_bytes.count, 0);
+ ASSERT_GT(get_perf_context()->number_async_seek, 0);
+ } else {
+ ASSERT_EQ(async_read_bytes.count, 0);
+ ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+ }
+ }
+ }
+
+ // Second Seek.
+ {
+ num_keys = 0;
+ ASSERT_OK(options.statistics->Reset());
+ get_perf_context()->Reset();
+
+ iter->Seek(BuildKey(942));
+ while (iter->Valid()) {
+ ASSERT_OK(iter->status());
+ num_keys++;
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(num_keys, num_keys_second_batch);
+
+ ASSERT_GT(buff_prefetch_count, 0);
+
+ // Check stats to make sure async prefetch is done.
+ {
+ HistogramData async_read_bytes;
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+ HistogramData prefetched_bytes_discarded;
+ options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
+ &prefetched_bytes_discarded);
+
+ // Not all platforms support iouring. In that case, ReadAsync in posix
+ // won't submit async requests.
+ if (read_async_called) {
+ ASSERT_GT(async_read_bytes.count, 0);
+ ASSERT_GT(get_perf_context()->number_async_seek, 0);
+ } else {
+ ASSERT_EQ(async_read_bytes.count, 0);
+ ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+ }
+ ASSERT_GT(prefetched_bytes_discarded.count, 0);
+ }
+ }
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+TEST_P(PrefetchTest, SeekParallelizationTest1) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ const int kNumKeys = 2000;
+ // Set options
+ std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+ FileSystem::Default(), /*support_prefetch=*/false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ bool use_direct_io = std::get<0>(GetParam());
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ WriteBatch batch;
+ Random rnd(309);
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ std::string start_key = BuildKey(0);
+ std::string end_key = BuildKey(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ int buff_prefetch_count = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+ [&](void*) { buff_prefetch_count++; });
+
+ bool read_async_called = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "UpdateResults::io_uring_result",
+ [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+ ReadOptions ro;
+ ro.adaptive_readahead = true;
+ ro.async_io = true;
+
+ if (std::get<1>(GetParam())) {
+ ro.readahead_size = 16 * 1024;
+ }
+
+ {
+ ASSERT_OK(options.statistics->Reset());
+ // Each block contains around 4 keys.
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+ iter->Seek(BuildKey(0)); // Prefetch data because of seek parallelization.
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+
+    // New data block. Since num_file_reads in FilePrefetchBuffer is 2 after
+    // this read, it will not prefetch yet.
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+
+ // Prefetch data.
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+
+ // Check stats to make sure async prefetch is done.
+ {
+ HistogramData async_read_bytes;
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+ // Not all platforms support iouring. In that case, ReadAsync in posix
+ // won't submit async requests.
+ if (read_async_called) {
+ ASSERT_GT(async_read_bytes.count, 0);
+ ASSERT_GT(get_perf_context()->number_async_seek, 0);
+ if (std::get<1>(GetParam())) {
+ ASSERT_EQ(buff_prefetch_count, 1);
+ } else {
+ ASSERT_EQ(buff_prefetch_count, 2);
+ }
+ } else {
+ ASSERT_EQ(async_read_bytes.count, 0);
+ ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+ ASSERT_EQ(buff_prefetch_count, 1);
+ }
+ }
+
+ buff_prefetch_count = 0;
+ }
+ Close();
+}
+
+#ifndef ROCKSDB_LITE
+#ifdef GFLAGS
+TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+
+ const int kNumKeys = 1000;
+ std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+ FileSystem::Default(), /*support_prefetch=*/false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+ bool use_direct_io = std::get<0>(GetParam());
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1024;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.statistics = CreateDBStatistics();
+ if (use_direct_io) {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ }
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.metadata_block_size = 1024;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Status s = TryReopen(options);
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+ // If direct IO is not supported, skip the test
+ return;
+ } else {
+ ASSERT_OK(s);
+ }
+
+ int total_keys = 0;
+ // Write the keys.
+ {
+ WriteBatch batch;
+ Random rnd(309);
+ for (int j = 0; j < 5; j++) {
+ for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+ total_keys++;
+ }
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+ }
+
+ int buff_prefetch_count = 0;
+ bool read_async_called = false;
+ ReadOptions ro;
+ ro.adaptive_readahead = true;
+ ro.async_io = true;
+
+ if (std::get<1>(GetParam())) {
+ ro.readahead_size = 16 * 1024;
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+ [&](void*) { buff_prefetch_count++; });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "UpdateResults::io_uring_result",
+ [&](void* /*arg*/) { read_async_called = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Read the keys.
+ {
+ // Start io_tracing.
+ WriteOptions write_opt;
+ TraceOptions trace_opt;
+ std::unique_ptr<TraceWriter> trace_writer;
+ std::string trace_file_path = dbname_ + "/io_trace_file";
+
+ ASSERT_OK(
+ NewFileTraceWriter(env_, EnvOptions(), trace_file_path, &trace_writer));
+ ASSERT_OK(db_->StartIOTrace(trace_opt, std::move(trace_writer)));
+ ASSERT_OK(options.statistics->Reset());
+
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+ int num_keys = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ num_keys++;
+ }
+
+ // End the tracing.
+ ASSERT_OK(db_->EndIOTrace());
+ ASSERT_OK(env_->FileExists(trace_file_path));
+
+ ASSERT_EQ(num_keys, total_keys);
+ ASSERT_GT(buff_prefetch_count, 0);
+
+ // Check stats to make sure async prefetch is done.
+ {
+ HistogramData async_read_bytes;
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+ // Not all platforms support iouring. In that case, ReadAsync in posix
+ // won't submit async requests.
+ if (read_async_called) {
+ ASSERT_GT(async_read_bytes.count, 0);
+ } else {
+ ASSERT_EQ(async_read_bytes.count, 0);
+ }
+ }
+
+ // Check the file to see if ReadAsync is logged.
+ RunIOTracerParserTool(trace_file_path);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Close();
+}
+#endif // GFLAGS
+
+class FilePrefetchBufferTest : public testing::Test {
+ public:
+ void SetUp() override {
+ SetupSyncPointsToMockDirectIO();
+ env_ = Env::Default();
+ fs_ = FileSystem::Default();
+ test_dir_ = test::PerThreadDBPath("file_prefetch_buffer_test");
+ ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+ }
+
+ void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+ void Write(const std::string& fname, const std::string& content) {
+ std::unique_ptr<FSWritableFile> f;
+ ASSERT_OK(fs_->NewWritableFile(Path(fname), FileOptions(), &f, nullptr));
+ ASSERT_OK(f->Append(content, IOOptions(), nullptr));
+ ASSERT_OK(f->Close(IOOptions(), nullptr));
+ }
+
+ void Read(const std::string& fname, const FileOptions& opts,
+ std::unique_ptr<RandomAccessFileReader>* reader) {
+ std::string fpath = Path(fname);
+ std::unique_ptr<FSRandomAccessFile> f;
+ ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr));
+ reader->reset(new RandomAccessFileReader(std::move(f), fpath,
+ env_->GetSystemClock().get()));
+ }
+
+ void AssertResult(const std::string& content,
+ const std::vector<FSReadRequest>& reqs) {
+ for (const auto& r : reqs) {
+ ASSERT_OK(r.status);
+ ASSERT_EQ(r.len, r.result.size());
+ ASSERT_EQ(content.substr(r.offset, r.len), r.result.ToString());
+ }
+ }
+
+ FileSystem* fs() { return fs_.get(); }
+
+ private:
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string test_dir_;
+
+ std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+};
+
+TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) {
+ std::string fname = "seek-with-block-cache-hit";
+ Random rand(0);
+ std::string content = rand.RandomString(32768);
+ Write(fname, content);
+
+ FileOptions opts;
+ std::unique_ptr<RandomAccessFileReader> r;
+ Read(fname, opts, &r);
+
+ FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, fs());
+ Slice result;
+ // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings,
+ // it will do two reads of 4096+8192 and 8192
+ Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result);
+ ASSERT_EQ(s, Status::TryAgain());
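+  // TryAgain is assumed here to mean the data was not already buffered and an
+  // asynchronous prefetch has been scheduled (an interpretation of the test's
+  // expectation, not asserted by the API itself).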
+ // Simulate a block cache hit
+ fpb.UpdateReadPattern(0, 4096, false);
+ // Now read some data that straddles the two prefetch buffers - offset 8192 to
+ // 16384
+ ASSERT_TRUE(fpb.TryReadFromCacheAsync(IOOptions(), r.get(), 8192, 8192,
+ &result, &s, Env::IOPriority::IO_LOW));
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/file/random_access_file_reader.cc b/src/rocksdb/file/random_access_file_reader.cc
new file mode 100644
index 000000000..030cd8d07
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader.cc
@@ -0,0 +1,602 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/random_access_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/file_util.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "table/format.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
+ bool is_last_level, size_t size) {
+ IOSTATS_ADD(bytes_read, size);
+ // record for last/non-last level
+ if (is_last_level) {
+ RecordTick(stats, LAST_LEVEL_READ_BYTES, size);
+ RecordTick(stats, LAST_LEVEL_READ_COUNT, 1);
+ } else {
+ RecordTick(stats, NON_LAST_LEVEL_READ_BYTES, size);
+ RecordTick(stats, NON_LAST_LEVEL_READ_COUNT, 1);
+ }
+
+ // record for temperature file
+ if (file_temperature != Temperature::kUnknown) {
+ switch (file_temperature) {
+ case Temperature::kHot:
+ IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size);
+ IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1);
+ RecordTick(stats, HOT_FILE_READ_BYTES, size);
+ RecordTick(stats, HOT_FILE_READ_COUNT, 1);
+ break;
+ case Temperature::kWarm:
+ IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size);
+ IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1);
+ RecordTick(stats, WARM_FILE_READ_BYTES, size);
+ RecordTick(stats, WARM_FILE_READ_COUNT, 1);
+ break;
+ case Temperature::kCold:
+ IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
+ IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
+ RecordTick(stats, COLD_FILE_READ_BYTES, size);
+ RecordTick(stats, COLD_FILE_READ_COUNT, 1);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+IOStatus RandomAccessFileReader::Create(
+ const std::shared_ptr<FileSystem>& fs, const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<RandomAccessFileReader>* reader, IODebugContext* dbg) {
+ std::unique_ptr<FSRandomAccessFile> file;
+ IOStatus io_s = fs->NewRandomAccessFile(fname, file_opts, &file, dbg);
+ if (io_s.ok()) {
+ reader->reset(new RandomAccessFileReader(std::move(file), fname));
+ }
+ return io_s;
+}
+
+IOStatus RandomAccessFileReader::Read(
+ const IOOptions& opts, uint64_t offset, size_t n, Slice* result,
+ char* scratch, AlignedBuf* aligned_buf,
+ Env::IOPriority rate_limiter_priority) const {
+ (void)aligned_buf;
+
+ TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr);
+
+  // To be paranoid: modify scratch a little bit, so that if the underlying
+  // FileSystem doesn't fill the buffer but returns success and `scratch`
+  // still contains a previous block, the returned value will not pass the
+  // checksum.
+ if (n > 0 && scratch != nullptr) {
+ // This byte might not change anything for direct I/O case, but it's OK.
+ scratch[0]++;
+ }
+
+ IOStatus io_s;
+ uint64_t elapsed = 0;
+ {
+ StopWatch sw(clock_, stats_, hist_type_,
+ (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
+ true /*delay_enabled*/);
+ auto prev_perf_level = GetPerfLevel();
+ IOSTATS_TIMER_GUARD(read_nanos);
+ if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+ size_t alignment = file_->GetRequiredBufferAlignment();
+ size_t aligned_offset =
+ TruncateToPageBoundary(alignment, static_cast<size_t>(offset));
+ size_t offset_advance = static_cast<size_t>(offset) - aligned_offset;
+ size_t read_size =
+ Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset;
+ AlignedBuffer buf;
+ buf.Alignment(alignment);
+ buf.AllocateNewBuffer(read_size);
+ while (buf.CurrentSize() < read_size) {
+ size_t allowed;
+ if (rate_limiter_priority != Env::IO_TOTAL &&
+ rate_limiter_ != nullptr) {
+ allowed = rate_limiter_->RequestToken(
+ buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
+ rate_limiter_priority, stats_, RateLimiter::OpType::kRead);
+ } else {
+ assert(buf.CurrentSize() == 0);
+ allowed = read_size;
+ }
+ Slice tmp;
+
+ FileOperationInfo::StartTimePoint start_ts;
+ uint64_t orig_offset = 0;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ orig_offset = aligned_offset + buf.CurrentSize();
+ }
+
+ {
+ IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+ // Only user reads are expected to specify a timeout. And user reads
+ // are not subjected to rate_limiter and should go through only
+ // one iteration of this loop, so we don't need to check and adjust
+ // the opts.timeout before calling file_->Read
+ assert(!opts.timeout.count() || allowed == read_size);
+ io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts,
+ &tmp, buf.Destination(), nullptr);
+ }
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
+ io_s);
+ if (!io_s.ok()) {
+ NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
+ tmp.size(), orig_offset);
+ }
+ }
+
+ buf.Size(buf.CurrentSize() + tmp.size());
+ if (!io_s.ok() || tmp.size() < allowed) {
+ break;
+ }
+ }
+ size_t res_len = 0;
+ if (io_s.ok() && offset_advance < buf.CurrentSize()) {
+ res_len = std::min(buf.CurrentSize() - offset_advance, n);
+ if (aligned_buf == nullptr) {
+ buf.Read(scratch, offset_advance, res_len);
+ } else {
+ scratch = buf.BufferStart() + offset_advance;
+ aligned_buf->reset(buf.Release());
+ }
+ }
+ *result = Slice(scratch, res_len);
+#endif // !ROCKSDB_LITE
+ } else {
+ size_t pos = 0;
+ const char* res_scratch = nullptr;
+ while (pos < n) {
+ size_t allowed;
+ if (rate_limiter_priority != Env::IO_TOTAL &&
+ rate_limiter_ != nullptr) {
+ if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+ sw.DelayStart();
+ }
+ allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */,
+ rate_limiter_priority, stats_,
+ RateLimiter::OpType::kRead);
+ if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+ sw.DelayStop();
+ }
+ } else {
+ allowed = n;
+ }
+ Slice tmp_result;
+
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif
+
+ {
+ IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+ // Only user reads are expected to specify a timeout. And user reads
+ // are not subjected to rate_limiter and should go through only
+ // one iteration of this loop, so we don't need to check and adjust
+ // the opts.timeout before calling file_->Read
+ assert(!opts.timeout.count() || allowed == n);
+ io_s = file_->Read(offset + pos, allowed, opts, &tmp_result,
+ scratch + pos, nullptr);
+ }
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
+ finish_ts, io_s);
+
+ if (!io_s.ok()) {
+ NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
+ tmp_result.size(), offset + pos);
+ }
+ }
+#endif
+ if (res_scratch == nullptr) {
+ // we can't simply use `scratch` because reads of mmap'd files return
+ // data in a different buffer.
+ res_scratch = tmp_result.data();
+ } else {
+ // make sure chunks are inserted contiguously into `res_scratch`.
+ assert(tmp_result.data() == res_scratch + pos);
+ }
+ pos += tmp_result.size();
+ if (!io_s.ok() || tmp_result.size() < allowed) {
+ break;
+ }
+ }
+ *result = Slice(res_scratch, io_s.ok() ? pos : 0);
+ }
+ RecordIOStats(stats_, file_temperature_, is_last_level_, result->size());
+ SetPerfLevel(prev_perf_level);
+ }
+ if (stats_ != nullptr && file_read_hist_ != nullptr) {
+ file_read_hist_->Add(elapsed);
+ }
+
+ return io_s;
+}
+
+size_t End(const FSReadRequest& r) {
+ return static_cast<size_t>(r.offset) + r.len;
+}
+
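+// Illustrative example (not part of the original source): with alignment
+// 4096, a request {offset=5000, len=3000} covers bytes [5000, 8000); Align
+// truncates the start down to 4096 and rounds the end up to 8192, returning
+// {offset=4096, len=4096}.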
+FSReadRequest Align(const FSReadRequest& r, size_t alignment) {
+ FSReadRequest req;
+ req.offset = static_cast<uint64_t>(
+ TruncateToPageBoundary(alignment, static_cast<size_t>(r.offset)));
+ req.len = Roundup(End(r), alignment) - req.offset;
+ req.scratch = nullptr;
+ return req;
+}
+
+bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) {
+ size_t dest_offset = static_cast<size_t>(dest->offset);
+ size_t src_offset = static_cast<size_t>(src.offset);
+ size_t dest_end = End(*dest);
+ size_t src_end = End(src);
+ if (std::max(dest_offset, src_offset) > std::min(dest_end, src_end)) {
+ return false;
+ }
+ dest->offset = static_cast<uint64_t>(std::min(dest_offset, src_offset));
+ dest->len = std::max(dest_end, src_end) - dest->offset;
+ return true;
+}
+
+IOStatus RandomAccessFileReader::MultiRead(
+ const IOOptions& opts, FSReadRequest* read_reqs, size_t num_reqs,
+ AlignedBuf* aligned_buf, Env::IOPriority rate_limiter_priority) const {
+ (void)aligned_buf; // suppress warning of unused variable in LITE mode
+ assert(num_reqs > 0);
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < num_reqs - 1; ++i) {
+ assert(read_reqs[i].offset <= read_reqs[i + 1].offset);
+ }
+#endif // !NDEBUG
+
+  // To be paranoid, modify scratch a little bit, so that if the underlying
+  // FileSystem doesn't fill the buffer but returns success and `scratch`
+  // still contains a previous block, the returned value will not pass the
+  // checksum. This byte might not change anything for the direct I/O case,
+  // but it's OK.
+ for (size_t i = 0; i < num_reqs; i++) {
+ FSReadRequest& r = read_reqs[i];
+ if (r.len > 0 && r.scratch != nullptr) {
+ r.scratch[0]++;
+ }
+ }
+
+ IOStatus io_s;
+ uint64_t elapsed = 0;
+ {
+ StopWatch sw(clock_, stats_, hist_type_,
+ (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
+ true /*delay_enabled*/);
+ auto prev_perf_level = GetPerfLevel();
+ IOSTATS_TIMER_GUARD(read_nanos);
+
+ FSReadRequest* fs_reqs = read_reqs;
+ size_t num_fs_reqs = num_reqs;
+#ifndef ROCKSDB_LITE
+ std::vector<FSReadRequest> aligned_reqs;
+ if (use_direct_io()) {
+      // num_reqs is the maximum possible size;
+      // reserving it can reduce std::vector's internal resize operations.
+ aligned_reqs.reserve(num_reqs);
+ // Align and merge the read requests.
+ size_t alignment = file_->GetRequiredBufferAlignment();
+ for (size_t i = 0; i < num_reqs; i++) {
+ const auto& r = Align(read_reqs[i], alignment);
+ if (i == 0) {
+ // head
+ aligned_reqs.push_back(r);
+
+ } else if (!TryMerge(&aligned_reqs.back(), r)) {
+ // head + n
+ aligned_reqs.push_back(r);
+
+ } else {
+ // unused
+ r.status.PermitUncheckedError();
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs",
+ &aligned_reqs);
+
+ // Allocate aligned buffer and let scratch buffers point to it.
+ size_t total_len = 0;
+ for (const auto& r : aligned_reqs) {
+ total_len += r.len;
+ }
+ AlignedBuffer buf;
+ buf.Alignment(alignment);
+ buf.AllocateNewBuffer(total_len);
+ char* scratch = buf.BufferStart();
+ for (auto& r : aligned_reqs) {
+ r.scratch = scratch;
+ scratch += r.len;
+ }
+
+ aligned_buf->reset(buf.Release());
+ fs_reqs = aligned_reqs.data();
+ num_fs_reqs = aligned_reqs.size();
+ }
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif // ROCKSDB_LITE
+
+ {
+ IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+ if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+ // TODO: ideally we should call `RateLimiter::RequestToken()` for
+ // allowed bytes to multi-read and then consume those bytes by
+ // satisfying as many requests in `MultiRead()` as possible, instead of
+ // what we do here, which can cause burst when the
+ // `total_multi_read_size` is big.
+ size_t total_multi_read_size = 0;
+ assert(fs_reqs != nullptr);
+ for (size_t i = 0; i < num_fs_reqs; ++i) {
+ FSReadRequest& req = fs_reqs[i];
+ total_multi_read_size += req.len;
+ }
+ size_t remaining_bytes = total_multi_read_size;
+ size_t request_bytes = 0;
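+        // For example, with a 1 MB single-burst limit a 2.5 MB batch is
+        // charged as three requests of 1 MB, 1 MB and 0.5 MB.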
+ while (remaining_bytes > 0) {
+ request_bytes = std::min(
+ static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()),
+ remaining_bytes);
+ rate_limiter_->Request(request_bytes, rate_limiter_priority,
+ nullptr /* stats */,
+ RateLimiter::OpType::kRead);
+ remaining_bytes -= request_bytes;
+ }
+ }
+ io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr);
+ RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs);
+ }
+
+#ifndef ROCKSDB_LITE
+ if (use_direct_io()) {
+ // Populate results in the unaligned read requests.
+ size_t aligned_i = 0;
+ for (size_t i = 0; i < num_reqs; i++) {
+ auto& r = read_reqs[i];
+ if (static_cast<size_t>(r.offset) > End(aligned_reqs[aligned_i])) {
+ aligned_i++;
+ }
+ const auto& fs_r = fs_reqs[aligned_i];
+ r.status = fs_r.status;
+ if (r.status.ok()) {
+ uint64_t offset = r.offset - fs_r.offset;
+ if (fs_r.result.size() <= offset) {
+ // No byte in the read range is returned.
+ r.result = Slice();
+ } else {
+ size_t len = std::min(
+ r.len, static_cast<size_t>(fs_r.result.size() - offset));
+ r.result = Slice(fs_r.scratch + offset, len);
+ }
+ } else {
+ r.result = Slice();
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+ for (size_t i = 0; i < num_reqs; ++i) {
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(),
+ start_ts, finish_ts, read_reqs[i].status);
+ }
+ if (!read_reqs[i].status.ok()) {
+ NotifyOnIOError(read_reqs[i].status, FileOperationType::kRead,
+ file_name(), read_reqs[i].result.size(),
+ read_reqs[i].offset);
+ }
+
+#endif // ROCKSDB_LITE
+ RecordIOStats(stats_, file_temperature_, is_last_level_,
+ read_reqs[i].result.size());
+ }
+ SetPerfLevel(prev_perf_level);
+ }
+ if (stats_ != nullptr && file_read_hist_ != nullptr) {
+ file_read_hist_->Add(elapsed);
+ }
+
+ return io_s;
+}
+
+IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
+ IOOptions& opts) {
+ if (clock_ != nullptr) {
+ return PrepareIOFromReadOptions(ro, clock_, opts);
+ } else {
+ return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts);
+ }
+}
+
+IOStatus RandomAccessFileReader::ReadAsync(
+ FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+ void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) {
+ IOStatus s;
+ // Create a callback and populate info.
+ auto read_async_callback =
+ std::bind(&RandomAccessFileReader::ReadAsyncCallback, this,
+ std::placeholders::_1, std::placeholders::_2);
+ ReadAsyncInfo* read_async_info =
+ new ReadAsyncInfo(cb, cb_arg, clock_->NowMicros());
+
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ read_async_info->fs_start_ts_ = FileOperationInfo::StartNow();
+ }
+#endif
+
+ size_t alignment = file_->GetRequiredBufferAlignment();
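+  // The bitmask check below assumes alignment is a power of two (e.g. with
+  // alignment 4096, offset 8192 & 4095 == 0, so offset 8192 is aligned).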
+ bool is_aligned = (req.offset & (alignment - 1)) == 0 &&
+ (req.len & (alignment - 1)) == 0 &&
+ (uintptr_t(req.scratch) & (alignment - 1)) == 0;
+ read_async_info->is_aligned_ = is_aligned;
+
+ uint64_t elapsed = 0;
+ if (use_direct_io() && is_aligned == false) {
+ FSReadRequest aligned_req = Align(req, alignment);
+ aligned_req.status.PermitUncheckedError();
+
+ // Allocate aligned buffer.
+ read_async_info->buf_.Alignment(alignment);
+ read_async_info->buf_.AllocateNewBuffer(aligned_req.len);
+
+    // Set the remaining fields in the aligned FSReadRequest.
+ aligned_req.scratch = read_async_info->buf_.BufferStart();
+
+ // Set user provided fields to populate back in callback.
+ read_async_info->user_scratch_ = req.scratch;
+ read_async_info->user_aligned_buf_ = aligned_buf;
+ read_async_info->user_len_ = req.len;
+ read_async_info->user_offset_ = req.offset;
+ read_async_info->user_result_ = req.result;
+
+ assert(read_async_info->buf_.CurrentSize() == 0);
+
+ StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed,
+ true /*overwrite*/, true /*delay_enabled*/);
+ s = file_->ReadAsync(aligned_req, opts, read_async_callback,
+ read_async_info, io_handle, del_fn, nullptr /*dbg*/);
+ } else {
+ StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed,
+ true /*overwrite*/, true /*delay_enabled*/);
+ s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
+ io_handle, del_fn, nullptr /*dbg*/);
+ }
+ RecordTick(stats_, READ_ASYNC_MICROS, elapsed);
+
+// Suppress false positive clang analyzer warnings.
+// Memory is not released if file_->ReadAsync returns !s.ok(), because
+// ReadAsyncCallback is never called in that case. If ReadAsyncCallback is
+// called then ReadAsync should always return IOStatus::OK().
+#ifndef __clang_analyzer__
+ if (!s.ok()) {
+ delete read_async_info;
+ }
+#endif // __clang_analyzer__
+
+ return s;
+}
+
+void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req,
+ void* cb_arg) {
+ ReadAsyncInfo* read_async_info = static_cast<ReadAsyncInfo*>(cb_arg);
+ assert(read_async_info);
+ assert(read_async_info->cb_);
+
+ if (use_direct_io() && read_async_info->is_aligned_ == false) {
+ // Create FSReadRequest with user provided fields.
+ FSReadRequest user_req;
+ user_req.scratch = read_async_info->user_scratch_;
+ user_req.offset = read_async_info->user_offset_;
+ user_req.len = read_async_info->user_len_;
+
+ // Update results in user_req.
+ user_req.result = req.result;
+ user_req.status = req.status;
+
+ read_async_info->buf_.Size(read_async_info->buf_.CurrentSize() +
+ req.result.size());
+
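+    // For example (illustrative only), if the user asked for offset 5000 and
+    // the aligned request started at 4096, offset_advance_len is 904 and the
+    // user's data begins 904 bytes into the aligned buffer.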
+ size_t offset_advance_len = static_cast<size_t>(
+ /*offset_passed_by_user=*/read_async_info->user_offset_ -
+ /*aligned_offset=*/req.offset);
+
+ size_t res_len = 0;
+ if (req.status.ok() &&
+ offset_advance_len < read_async_info->buf_.CurrentSize()) {
+ res_len =
+ std::min(read_async_info->buf_.CurrentSize() - offset_advance_len,
+ read_async_info->user_len_);
+ if (read_async_info->user_aligned_buf_ == nullptr) {
+ // Copy the data into user's scratch.
+// The Clang analyzer assumes use_direct_io() == false in ReadAsync and
+// use_direct_io() == true in this callback, which cannot both hold.
+#ifndef __clang_analyzer__
+ read_async_info->buf_.Read(user_req.scratch, offset_advance_len,
+ res_len);
+#endif // __clang_analyzer__
+ } else {
+ // Set aligned_buf provided by user without additional copy.
+ user_req.scratch =
+ read_async_info->buf_.BufferStart() + offset_advance_len;
+ read_async_info->user_aligned_buf_->reset(
+ read_async_info->buf_.Release());
+ }
+ user_req.result = Slice(user_req.scratch, res_len);
+ } else {
+ // Either req.status is not ok or data was not read.
+ user_req.result = Slice();
+ }
+ read_async_info->cb_(user_req, read_async_info->cb_arg_);
+ } else {
+ read_async_info->cb_(req, read_async_info->cb_arg_);
+ }
+
+ // Update stats and notify listeners.
+ if (stats_ != nullptr && file_read_hist_ != nullptr) {
+ // elapsed doesn't take into account delay and overwrite as StopWatch does
+ // in Read.
+ uint64_t elapsed = clock_->NowMicros() - read_async_info->start_time_;
+ file_read_hist_->Add(elapsed);
+ }
+ if (req.status.ok()) {
+ RecordInHistogram(stats_, ASYNC_READ_BYTES, req.result.size());
+ } else if (!req.status.IsAborted()) {
+ RecordTick(stats_, ASYNC_READ_ERROR_COUNT, 1);
+ }
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ NotifyOnFileReadFinish(req.offset, req.result.size(),
+ read_async_info->fs_start_ts_, finish_ts,
+ req.status);
+ }
+ if (!req.status.ok()) {
+ NotifyOnIOError(req.status, FileOperationType::kRead, file_name(),
+ req.result.size(), req.offset);
+ }
+#endif
+ RecordIOStats(stats_, file_temperature_, is_last_level_, req.result.size());
+ delete read_async_info;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/random_access_file_reader.h b/src/rocksdb/file/random_access_file_reader.h
new file mode 100644
index 000000000..ea7cfd234
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader.h
@@ -0,0 +1,217 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <sstream>
+#include <string>
+
+#include "env/file_system_tracer.h"
+#include "port/port.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class HistogramImpl;
+class SystemClock;
+
+using AlignedBuf = std::unique_ptr<char[]>;
+
+// Align the request r according to alignment and return the aligned result.
+FSReadRequest Align(const FSReadRequest& r, size_t alignment);
+
+// Try to merge src to dest if they have overlap.
+//
+// Each request represents an inclusive interval [offset, offset + len].
+// If the intervals have overlap, update offset and len to represent the
+// merged interval, and return true.
+// Otherwise, do nothing and return false.
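+//
+// For example (illustrative, assuming both requests are already aligned),
+// {offset=0, len=4096} and {offset=4096, len=4096} touch at 4096 and are
+// merged into {offset=0, len=8192}.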
+bool TryMerge(FSReadRequest* dest, const FSReadRequest& src);
+
+// RandomAccessFileReader is a wrapper on top of FSRandomAccessFile. It is
+// responsible for:
+// - Handling Buffered and Direct reads appropriately.
+// - Rate limiting compaction reads.
+// - Notifying any interested listeners on the completion of a read.
+// - Updating IO stats.
+class RandomAccessFileReader {
+ private:
+#ifndef ROCKSDB_LITE
+ void NotifyOnFileReadFinish(
+ uint64_t offset, size_t length,
+ const FileOperationInfo::StartTimePoint& start_ts,
+ const FileOperationInfo::FinishTimePoint& finish_ts,
+ const Status& status) const {
+ FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts,
+ finish_ts, status, file_temperature_);
+ info.offset = offset;
+ info.length = length;
+
+ for (auto& listener : listeners_) {
+ listener->OnFileReadFinish(info);
+ }
+ info.status.PermitUncheckedError();
+ }
+
+ void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation,
+ const std::string& file_path, size_t length,
+ uint64_t offset) const {
+ if (listeners_.empty()) {
+ return;
+ }
+ IOErrorInfo io_error_info(io_status, operation, file_path, length, offset);
+
+ for (auto& listener : listeners_) {
+ listener->OnIOError(io_error_info);
+ }
+ io_status.PermitUncheckedError();
+ }
+
+#endif // ROCKSDB_LITE
+
+ bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+
+ FSRandomAccessFilePtr file_;
+ std::string file_name_;
+ SystemClock* clock_;
+ Statistics* stats_;
+ uint32_t hist_type_;
+ HistogramImpl* file_read_hist_;
+ RateLimiter* rate_limiter_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ const Temperature file_temperature_;
+ const bool is_last_level_;
+
+ struct ReadAsyncInfo {
+ ReadAsyncInfo(std::function<void(const FSReadRequest&, void*)> cb,
+ void* cb_arg, uint64_t start_time)
+ : cb_(cb),
+ cb_arg_(cb_arg),
+ start_time_(start_time),
+ user_scratch_(nullptr),
+ user_aligned_buf_(nullptr),
+ user_offset_(0),
+ user_len_(0),
+ is_aligned_(false) {}
+
+ std::function<void(const FSReadRequest&, void*)> cb_;
+ void* cb_arg_;
+ uint64_t start_time_;
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint fs_start_ts_;
+#endif
+    // The fields below store the parameters passed by the caller for direct_io.
+ char* user_scratch_;
+ AlignedBuf* user_aligned_buf_;
+ uint64_t user_offset_;
+ size_t user_len_;
+ Slice user_result_;
+ // Used in case of direct_io
+ AlignedBuffer buf_;
+ bool is_aligned_;
+ };
+
+ public:
+ explicit RandomAccessFileReader(
+ std::unique_ptr<FSRandomAccessFile>&& raf, const std::string& _file_name,
+ SystemClock* clock = nullptr,
+ const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+ Statistics* stats = nullptr, uint32_t hist_type = 0,
+ HistogramImpl* file_read_hist = nullptr,
+ RateLimiter* rate_limiter = nullptr,
+ const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+ Temperature file_temperature = Temperature::kUnknown,
+ bool is_last_level = false)
+ : file_(std::move(raf), io_tracer, _file_name),
+ file_name_(std::move(_file_name)),
+ clock_(clock),
+ stats_(stats),
+ hist_type_(hist_type),
+ file_read_hist_(file_read_hist),
+ rate_limiter_(rate_limiter),
+ listeners_(),
+ file_temperature_(file_temperature),
+ is_last_level_(is_last_level) {
+#ifndef ROCKSDB_LITE
+ std::for_each(listeners.begin(), listeners.end(),
+ [this](const std::shared_ptr<EventListener>& e) {
+ if (e->ShouldBeNotifiedOnFileIO()) {
+ listeners_.emplace_back(e);
+ }
+ });
+#else // !ROCKSDB_LITE
+ (void)listeners;
+#endif
+ }
+
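+  // Example use (illustrative only; the path and options are placeholders):
+  //   std::unique_ptr<RandomAccessFileReader> reader;
+  //   IOStatus io_s = RandomAccessFileReader::Create(
+  //       FileSystem::Default(), "/path/to/file.sst", FileOptions(), &reader,
+  //       nullptr /*dbg*/);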
+ static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<RandomAccessFileReader>* reader,
+ IODebugContext* dbg);
+ RandomAccessFileReader(const RandomAccessFileReader&) = delete;
+ RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;
+
+ // In non-direct IO mode,
+ // 1. if using mmap, result is stored in a buffer other than scratch;
+ // 2. if not using mmap, result is stored in the buffer starting from scratch.
+ //
+ // In direct IO mode, an aligned buffer is allocated internally.
+ // 1. If aligned_buf is null, then results are copied to the buffer
+ // starting from scratch;
+ // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns
+ // the internally allocated buffer on return, and the result refers to a
+ // region in aligned_buf.
+ //
+ // `rate_limiter_priority` is used to charge the internal rate limiter when
+ // enabled. The special value `Env::IO_TOTAL` makes this operation bypass the
+ // rate limiter.
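+  //
+  // Example (illustrative only): read 4 KiB at offset 0 into a caller-owned
+  // buffer, bypassing the rate limiter:
+  //   Slice result;
+  //   char scratch[4096];
+  //   IOStatus s = reader->Read(IOOptions(), 0 /*offset*/, sizeof(scratch),
+  //                             &result, scratch, nullptr /*aligned_buf*/,
+  //                             Env::IO_TOTAL);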
+ IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result,
+ char* scratch, AlignedBuf* aligned_buf,
+ Env::IOPriority rate_limiter_priority) const;
+
+ // REQUIRES:
+ // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing.
+ // In non-direct IO mode, aligned_buf should be null;
+ // In direct IO mode, aligned_buf stores the aligned buffer allocated inside
+ // MultiRead, the result Slices in reqs refer to aligned_buf.
+ //
+ // `rate_limiter_priority` will be used to charge the internal rate limiter.
+ // It is not yet supported so the client must provide the special value
+ // `Env::IO_TOTAL` to bypass the rate limiter.
+ IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs,
+ size_t num_reqs, AlignedBuf* aligned_buf,
+ Env::IOPriority rate_limiter_priority) const;
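+  //
+  // Example (hypothetical sketch mirroring the direct I/O usage in the unit
+  // tests; `reader`, `opts` and the offsets/lengths are illustrative only):
+  //
+  //   FSReadRequest reqs[2];
+  //   reqs[0].offset = 0;    reqs[0].len = 4096; reqs[0].scratch = nullptr;
+  //   reqs[1].offset = 8192; reqs[1].len = 4096; reqs[1].scratch = nullptr;
+  //   AlignedBuf aligned_buf;  // owns the internally allocated buffer
+  //   IOStatus s = reader->MultiRead(opts, reqs, 2, &aligned_buf,
+  //                                  Env::IO_TOTAL);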
+
+ IOStatus Prefetch(uint64_t offset, size_t n,
+ const Env::IOPriority rate_limiter_priority) const {
+ IOOptions opts;
+ opts.rate_limiter_priority = rate_limiter_priority;
+ return file_->Prefetch(offset, n, opts, nullptr);
+ }
+
+ FSRandomAccessFile* file() { return file_.get(); }
+
+ const std::string& file_name() const { return file_name_; }
+
+ bool use_direct_io() const { return file_->use_direct_io(); }
+
+ IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts);
+
+ IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb,
+ void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+ AlignedBuf* aligned_buf);
+
+ void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg);
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/random_access_file_reader_test.cc b/src/rocksdb/file/random_access_file_reader_test.cc
new file mode 100644
index 000000000..ac0e9e57a
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader_test.cc
@@ -0,0 +1,481 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "file/random_access_file_reader.h"
+
+#include <algorithm>
+
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/file_system.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReaderTest : public testing::Test {
+ public:
+ void SetUp() override {
+ SetupSyncPointsToMockDirectIO();
+ env_ = Env::Default();
+ fs_ = FileSystem::Default();
+ test_dir_ = test::PerThreadDBPath("random_access_file_reader_test");
+ ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+ }
+
+ void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+ void Write(const std::string& fname, const std::string& content) {
+ std::unique_ptr<FSWritableFile> f;
+ ASSERT_OK(fs_->NewWritableFile(Path(fname), FileOptions(), &f, nullptr));
+ ASSERT_OK(f->Append(content, IOOptions(), nullptr));
+ ASSERT_OK(f->Close(IOOptions(), nullptr));
+ }
+
+ void Read(const std::string& fname, const FileOptions& opts,
+ std::unique_ptr<RandomAccessFileReader>* reader) {
+ std::string fpath = Path(fname);
+ std::unique_ptr<FSRandomAccessFile> f;
+ ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr));
+ reader->reset(new RandomAccessFileReader(std::move(f), fpath,
+ env_->GetSystemClock().get()));
+ }
+
+ void AssertResult(const std::string& content,
+ const std::vector<FSReadRequest>& reqs) {
+ for (const auto& r : reqs) {
+ ASSERT_OK(r.status);
+ ASSERT_EQ(r.len, r.result.size());
+ ASSERT_EQ(content.substr(r.offset, r.len), r.result.ToString());
+ }
+ }
+
+ private:
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string test_dir_;
+
+ std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+};
+
+// Skip the following tests in lite mode since direct I/O is unsupported.
+#ifndef ROCKSDB_LITE
+
+TEST_F(RandomAccessFileReaderTest, ReadDirectIO) {
+ std::string fname = "read-direct-io";
+ Random rand(0);
+ std::string content = rand.RandomString(kDefaultPageSize);
+ Write(fname, content);
+
+ FileOptions opts;
+ opts.use_direct_reads = true;
+ std::unique_ptr<RandomAccessFileReader> r;
+ Read(fname, opts, &r);
+ ASSERT_TRUE(r->use_direct_io());
+
+ const size_t page_size = r->file()->GetRequiredBufferAlignment();
+ size_t offset = page_size / 2;
+ size_t len = page_size / 3;
+ Slice result;
+ AlignedBuf buf;
+ for (Env::IOPriority rate_limiter_priority : {Env::IO_LOW, Env::IO_TOTAL}) {
+ ASSERT_OK(r->Read(IOOptions(), offset, len, &result, nullptr, &buf,
+ rate_limiter_priority));
+ ASSERT_EQ(result.ToString(), content.substr(offset, len));
+ }
+}
+
+TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
+ std::vector<FSReadRequest> aligned_reqs;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* reqs) {
+        // Copy reqs, since they are allocated on the stack inside MultiRead
+        // and will be deallocated after MultiRead returns.
+ aligned_reqs = *reinterpret_cast<std::vector<FSReadRequest>*>(reqs);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Creates a file with 3 pages.
+ std::string fname = "multi-read-direct-io";
+ Random rand(0);
+ std::string content = rand.RandomString(3 * kDefaultPageSize);
+ Write(fname, content);
+
+ FileOptions opts;
+ opts.use_direct_reads = true;
+ std::unique_ptr<RandomAccessFileReader> r;
+ Read(fname, opts, &r);
+ ASSERT_TRUE(r->use_direct_io());
+
+ const size_t page_size = r->file()->GetRequiredBufferAlignment();
+
+ {
+ // Reads 2 blocks in the 1st page.
+ // The results should be SharedSlices of the same underlying buffer.
+ //
+ // Illustration (each x is a 1/4 page)
+    // First page: xxxx
+    // 1st block:  x
+    // 2nd block:    xx
+ FSReadRequest r0;
+ r0.offset = 0;
+ r0.len = page_size / 4;
+ r0.scratch = nullptr;
+
+ FSReadRequest r1;
+ r1.offset = page_size / 2;
+ r1.len = page_size / 2;
+ r1.scratch = nullptr;
+
+ std::vector<FSReadRequest> reqs;
+ reqs.push_back(std::move(r0));
+ reqs.push_back(std::move(r1));
+ AlignedBuf aligned_buf;
+ ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */));
+
+ AssertResult(content, reqs);
+
+ // Reads the first page internally.
+ ASSERT_EQ(aligned_reqs.size(), 1);
+ const FSReadRequest& aligned_r = aligned_reqs[0];
+ ASSERT_OK(aligned_r.status);
+ ASSERT_EQ(aligned_r.offset, 0);
+ ASSERT_EQ(aligned_r.len, page_size);
+ }
+
+ {
+ // Reads 3 blocks:
+ // 1st block in the 1st page;
+ // 2nd block from the middle of the 1st page to the middle of the 2nd page;
+ // 3rd block in the 2nd page.
+ // The results should be SharedSlices of the same underlying buffer.
+ //
+ // Illustration (each x is a 1/4 page)
+    // 2 pages:    xxxxxxxx
+    // 1st block:  x
+    // 2nd block:    xxxx
+    // 3rd block:         x
+ FSReadRequest r0;
+ r0.offset = 0;
+ r0.len = page_size / 4;
+ r0.scratch = nullptr;
+
+ FSReadRequest r1;
+ r1.offset = page_size / 2;
+ r1.len = page_size;
+ r1.scratch = nullptr;
+
+ FSReadRequest r2;
+ r2.offset = 2 * page_size - page_size / 4;
+ r2.len = page_size / 4;
+ r2.scratch = nullptr;
+
+ std::vector<FSReadRequest> reqs;
+ reqs.push_back(std::move(r0));
+ reqs.push_back(std::move(r1));
+ reqs.push_back(std::move(r2));
+ AlignedBuf aligned_buf;
+ ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */));
+
+ AssertResult(content, reqs);
+
+ // Reads the first two pages in one request internally.
+ ASSERT_EQ(aligned_reqs.size(), 1);
+ const FSReadRequest& aligned_r = aligned_reqs[0];
+ ASSERT_OK(aligned_r.status);
+ ASSERT_EQ(aligned_r.offset, 0);
+ ASSERT_EQ(aligned_r.len, 2 * page_size);
+ }
+
+ {
+ // Reads 3 blocks:
+ // 1st block in the middle of the 1st page;
+ // 2nd block in the middle of the 2nd page;
+ // 3rd block in the middle of the 3rd page.
+ // The results should be SharedSlices of the same underlying buffer.
+ //
+ // Illustration (each x is a 1/4 page)
+    // 3 pages:    xxxxxxxxxxxx
+    // 1st block:   xx
+    // 2nd block:       xx
+    // 3rd block:           xx
+ FSReadRequest r0;
+ r0.offset = page_size / 4;
+ r0.len = page_size / 2;
+ r0.scratch = nullptr;
+
+ FSReadRequest r1;
+ r1.offset = page_size + page_size / 4;
+ r1.len = page_size / 2;
+ r1.scratch = nullptr;
+
+ FSReadRequest r2;
+ r2.offset = 2 * page_size + page_size / 4;
+ r2.len = page_size / 2;
+ r2.scratch = nullptr;
+
+ std::vector<FSReadRequest> reqs;
+ reqs.push_back(std::move(r0));
+ reqs.push_back(std::move(r1));
+ reqs.push_back(std::move(r2));
+ AlignedBuf aligned_buf;
+ ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */));
+
+ AssertResult(content, reqs);
+
+ // Reads the first 3 pages in one request internally.
+ ASSERT_EQ(aligned_reqs.size(), 1);
+ const FSReadRequest& aligned_r = aligned_reqs[0];
+ ASSERT_OK(aligned_r.status);
+ ASSERT_EQ(aligned_r.offset, 0);
+ ASSERT_EQ(aligned_r.len, 3 * page_size);
+ }
+
+ {
+ // Reads 2 blocks:
+ // 1st block in the middle of the 1st page;
+ // 2nd block in the middle of the 3rd page.
+ // The results are two different buffers.
+ //
+ // Illustration (each x is a 1/4 page)
+    // 3 pages:    xxxxxxxxxxxx
+    // 1st block:   xx
+    // 2nd block:           xx
+ FSReadRequest r0;
+ r0.offset = page_size / 4;
+ r0.len = page_size / 2;
+ r0.scratch = nullptr;
+
+ FSReadRequest r1;
+ r1.offset = 2 * page_size + page_size / 4;
+ r1.len = page_size / 2;
+ r1.scratch = nullptr;
+
+ std::vector<FSReadRequest> reqs;
+ reqs.push_back(std::move(r0));
+ reqs.push_back(std::move(r1));
+ AlignedBuf aligned_buf;
+ ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */));
+
+ AssertResult(content, reqs);
+
+ // Reads the 1st and 3rd pages in two requests internally.
+ ASSERT_EQ(aligned_reqs.size(), 2);
+ const FSReadRequest& aligned_r0 = aligned_reqs[0];
+ const FSReadRequest& aligned_r1 = aligned_reqs[1];
+ ASSERT_OK(aligned_r0.status);
+ ASSERT_EQ(aligned_r0.offset, 0);
+ ASSERT_EQ(aligned_r0.len, page_size);
+ ASSERT_OK(aligned_r1.status);
+ ASSERT_EQ(aligned_r1.offset, 2 * page_size);
+ ASSERT_EQ(aligned_r1.len, page_size);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST(FSReadRequest, Align) {
+ FSReadRequest r;
+ r.offset = 2000;
+ r.len = 2000;
+ r.scratch = nullptr;
+ ASSERT_OK(r.status);
+
+ FSReadRequest aligned_r = Align(r, 1024);
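+  // The expected values below follow from aligning to 1024 bytes (a sketch of
+  // the arithmetic, assuming Align() rounds the offset down and the request
+  // end up to the alignment):
+  //   aligned offset = 2000 - (2000 % 1024) = 1024
+  //   aligned end    = roundup(2000 + 2000, 1024) = 4096
+  //   aligned len    = 4096 - 1024 = 3072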
+ ASSERT_OK(r.status);
+ ASSERT_OK(aligned_r.status);
+ ASSERT_EQ(aligned_r.offset, 1024);
+ ASSERT_EQ(aligned_r.len, 3072);
+}
+
+TEST(FSReadRequest, TryMerge) {
+ // reverse means merging dest into src.
+ for (bool reverse : {true, false}) {
+ {
+      // dest: [         ]
+      // src:                 [         ]
+ FSReadRequest dest;
+ dest.offset = 0;
+ dest.len = 10;
+ dest.scratch = nullptr;
+ ASSERT_OK(dest.status);
+
+ FSReadRequest src;
+ src.offset = 15;
+ src.len = 10;
+ src.scratch = nullptr;
+ ASSERT_OK(src.status);
+
+ if (reverse) {
+ std::swap(dest, src);
+ }
+ ASSERT_FALSE(TryMerge(&dest, src));
+ ASSERT_OK(dest.status);
+ ASSERT_OK(src.status);
+ }
+
+ {
+      // dest: [         ]
+      // src:            [         ]
+ FSReadRequest dest;
+ dest.offset = 0;
+ dest.len = 10;
+ dest.scratch = nullptr;
+ ASSERT_OK(dest.status);
+
+ FSReadRequest src;
+ src.offset = 10;
+ src.len = 10;
+ src.scratch = nullptr;
+ ASSERT_OK(src.status);
+
+ if (reverse) {
+ std::swap(dest, src);
+ }
+ ASSERT_TRUE(TryMerge(&dest, src));
+ ASSERT_EQ(dest.offset, 0);
+ ASSERT_EQ(dest.len, 20);
+ ASSERT_OK(dest.status);
+ ASSERT_OK(src.status);
+ }
+
+ {
+      // dest: [         ]
+      // src:       [         ]
+ FSReadRequest dest;
+ dest.offset = 0;
+ dest.len = 10;
+ dest.scratch = nullptr;
+ ASSERT_OK(dest.status);
+
+ FSReadRequest src;
+ src.offset = 5;
+ src.len = 10;
+ src.scratch = nullptr;
+ ASSERT_OK(src.status);
+
+ if (reverse) {
+ std::swap(dest, src);
+ }
+ ASSERT_TRUE(TryMerge(&dest, src));
+ ASSERT_EQ(dest.offset, 0);
+ ASSERT_EQ(dest.len, 15);
+ ASSERT_OK(dest.status);
+ ASSERT_OK(src.status);
+ }
+
+ {
+      // dest: [         ]
+      // src:       [    ]
+ FSReadRequest dest;
+ dest.offset = 0;
+ dest.len = 10;
+ dest.scratch = nullptr;
+ ASSERT_OK(dest.status);
+
+ FSReadRequest src;
+ src.offset = 5;
+ src.len = 5;
+ src.scratch = nullptr;
+ ASSERT_OK(src.status);
+
+ if (reverse) {
+ std::swap(dest, src);
+ }
+ ASSERT_TRUE(TryMerge(&dest, src));
+ ASSERT_EQ(dest.offset, 0);
+ ASSERT_EQ(dest.len, 10);
+ ASSERT_OK(dest.status);
+ ASSERT_OK(src.status);
+ }
+
+ {
+      // dest: [         ]
+      // src:       []
+ FSReadRequest dest;
+ dest.offset = 0;
+ dest.len = 10;
+ dest.scratch = nullptr;
+ ASSERT_OK(dest.status);
+
+ FSReadRequest src;
+ src.offset = 5;
+ src.len = 1;
+ src.scratch = nullptr;
+ ASSERT_OK(src.status);
+
+ if (reverse) std::swap(dest, src);
+ ASSERT_TRUE(TryMerge(&dest, src));
+ ASSERT_EQ(dest.offset, 0);
+ ASSERT_EQ(dest.len, 10);
+ ASSERT_OK(dest.status);
+ ASSERT_OK(src.status);
+ }
+
+ {
+      // dest: [         ]
+      // src:  [         ]
+ FSReadRequest dest;
+ dest.offset = 0;
+ dest.len = 10;
+ dest.scratch = nullptr;
+ ASSERT_OK(dest.status);
+
+ FSReadRequest src;
+ src.offset = 0;
+ src.len = 10;
+ src.scratch = nullptr;
+ ASSERT_OK(src.status);
+
+ if (reverse) std::swap(dest, src);
+ ASSERT_TRUE(TryMerge(&dest, src));
+ ASSERT_EQ(dest.offset, 0);
+ ASSERT_EQ(dest.len, 10);
+ ASSERT_OK(dest.status);
+ ASSERT_OK(src.status);
+ }
+
+ {
+      // dest: [         ]
+      // src:  [    ]
+ FSReadRequest dest;
+ dest.offset = 0;
+ dest.len = 10;
+ dest.scratch = nullptr;
+ ASSERT_OK(dest.status);
+
+ FSReadRequest src;
+ src.offset = 0;
+ src.len = 5;
+ src.scratch = nullptr;
+ ASSERT_OK(src.status);
+
+ if (reverse) std::swap(dest, src);
+ ASSERT_TRUE(TryMerge(&dest, src));
+ ASSERT_EQ(dest.offset, 0);
+ ASSERT_EQ(dest.len, 10);
+ ASSERT_OK(dest.status);
+ ASSERT_OK(src.status);
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/file/read_write_util.cc b/src/rocksdb/file/read_write_util.cc
new file mode 100644
index 000000000..3617a35e3
--- /dev/null
+++ b/src/rocksdb/file/read_write_util.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/read_write_util.h"
+
+#include <sstream>
+
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
+ std::unique_ptr<FSWritableFile>* result,
+ const FileOptions& options) {
+ TEST_SYNC_POINT_CALLBACK("NewWritableFile::FileOptions.temperature",
+ const_cast<Temperature*>(&options.temperature));
+ IOStatus s = fs->NewWritableFile(fname, options, result, nullptr);
+ TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2);
+ return s;
+}
+
+#ifndef NDEBUG
+bool IsFileSectorAligned(const size_t off, size_t sector_size) {
+ return off % sector_size == 0;
+}
+#endif // NDEBUG
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/read_write_util.h b/src/rocksdb/file/read_write_util.h
new file mode 100644
index 000000000..9f034b705
--- /dev/null
+++ b/src/rocksdb/file/read_write_util.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+
+#include "file/sequence_file_reader.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a WritableFile.
+//
+// env : the Env.
+// fname : the file name.
+// result : output arg. A WritableFile based on `fname` returned.
+// options : the Env Options.
+extern IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
+ std::unique_ptr<FSWritableFile>* result,
+ const FileOptions& options);
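+//
+// Example (hypothetical sketch; `fs` and the file name are assumptions of the
+// example):
+//
+//   std::unique_ptr<FSWritableFile> file;
+//   IOStatus s = NewWritableFile(fs.get(), "/tmp/example.log", &file,
+//                                FileOptions());
+//   if (s.ok()) {
+//     s = file->Append("payload", IOOptions(), nullptr);
+//   }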
+
+#ifndef NDEBUG
+bool IsFileSectorAligned(const size_t off, size_t sector_size);
+#endif // NDEBUG
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_file_info.h b/src/rocksdb/file/readahead_file_info.h
new file mode 100644
index 000000000..f0208bf2d
--- /dev/null
+++ b/src/rocksdb/file/readahead_file_info.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// struct ReadaheadFileInfo contains readahead information that is passed from
+// one file to another file per level during iterations. This information helps
+// iterators carry forward the internal automatic prefetching readahead value
+// to the next file during sequential reads instead of starting from scratch.
+
+struct ReadaheadFileInfo {
+ struct ReadaheadInfo {
+ size_t readahead_size = 0;
+ int64_t num_file_reads = 0;
+ };
+
+ // Used by Data block iterators to update readahead info.
+ ReadaheadInfo data_block_readahead_info;
+
+ // Used by Index block iterators to update readahead info.
+ ReadaheadInfo index_block_readahead_info;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_raf.cc b/src/rocksdb/file/readahead_raf.cc
new file mode 100644
index 000000000..6d346432e
--- /dev/null
+++ b/src/rocksdb/file/readahead_raf.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/readahead_raf.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/read_write_util.h"
+#include "rocksdb/file_system.h"
+#include "util/aligned_buffer.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class ReadaheadRandomAccessFile : public FSRandomAccessFile {
+ public:
+ ReadaheadRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+ size_t readahead_size)
+ : file_(std::move(file)),
+ alignment_(file_->GetRequiredBufferAlignment()),
+ readahead_size_(Roundup(readahead_size, alignment_)),
+ buffer_(),
+ buffer_offset_(0) {
+ buffer_.Alignment(alignment_);
+ buffer_.AllocateNewBuffer(readahead_size_);
+ }
+
+ ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
+
+ ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) =
+ delete;
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+    // Read-ahead only makes sense if we have some slack left after reading
+ if (n + alignment_ >= readahead_size_) {
+ return file_->Read(offset, n, options, result, scratch, dbg);
+ }
+
+ std::unique_lock<std::mutex> lk(lock_);
+
+ size_t cached_len = 0;
+ // Check if there is a cache hit, meaning that [offset, offset + n) is
+ // either completely or partially in the buffer. If it's completely cached,
+ // including end of file case when offset + n is greater than EOF, then
+ // return.
+ if (TryReadFromCache(offset, n, &cached_len, scratch) &&
+ (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+ // We read exactly what we needed, or we hit end of file - return.
+ *result = Slice(scratch, cached_len);
+ return IOStatus::OK();
+ }
+ size_t advanced_offset = static_cast<size_t>(offset + cached_len);
+    // In the case of a cache hit, advanced_offset is already aligned, which
+    // means that chunk_offset equals advanced_offset.
+ size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
+
+ IOStatus s = ReadIntoBuffer(chunk_offset, readahead_size_, options, dbg);
+ if (s.ok()) {
+ // The data we need is now in cache, so we can safely read it
+ size_t remaining_len;
+ TryReadFromCache(advanced_offset, n - cached_len, &remaining_len,
+ scratch + cached_len);
+ *result = Slice(scratch, cached_len + remaining_len);
+ }
+ return s;
+ }
+
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override {
+ if (n < readahead_size_) {
+ // Don't allow smaller prefetches than the configured `readahead_size_`.
+ // `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
+ return IOStatus::OK();
+ }
+
+ std::unique_lock<std::mutex> lk(lock_);
+
+ size_t offset_ = static_cast<size_t>(offset);
+ size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_);
+ if (prefetch_offset == buffer_offset_) {
+ return IOStatus::OK();
+ }
+ return ReadIntoBuffer(prefetch_offset,
+ Roundup(offset_ + n, alignment_) - prefetch_offset,
+ options, dbg);
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return file_->GetUniqueId(id, max_size);
+ }
+
+ void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ std::unique_lock<std::mutex> lk(lock_);
+ buffer_.Clear();
+ return file_->InvalidateCache(offset, length);
+ }
+
+ bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read n bytes from buffer_ starting at offset. If anything was
+  // read from the cache, it sets cached_len to the number of bytes actually
+  // read, copies that many bytes to scratch and returns true.
+  // If nothing was read, it sets cached_len to 0 and returns false.
+ bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
+ char* scratch) const {
+ if (offset < buffer_offset_ ||
+ offset >= buffer_offset_ + buffer_.CurrentSize()) {
+ *cached_len = 0;
+ return false;
+ }
+ uint64_t offset_in_buffer = offset - buffer_offset_;
+ *cached_len = std::min(
+ buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+ memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+ return true;
+ }
+
+ // Reads into buffer_ the next n bytes from file_ starting at offset.
+ // Can actually read less if EOF was reached.
+  // Returns the status of the read operation on the file.
+ IOStatus ReadIntoBuffer(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) const {
+ if (n > buffer_.Capacity()) {
+ n = buffer_.Capacity();
+ }
+ assert(IsFileSectorAligned(offset, alignment_));
+ assert(IsFileSectorAligned(n, alignment_));
+ Slice result;
+ IOStatus s =
+ file_->Read(offset, n, options, &result, buffer_.BufferStart(), dbg);
+ if (s.ok()) {
+ buffer_offset_ = offset;
+ buffer_.Size(result.size());
+ assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+ }
+ return s;
+ }
+
+ const std::unique_ptr<FSRandomAccessFile> file_;
+ const size_t alignment_;
+ const size_t readahead_size_;
+
+ mutable std::mutex lock_;
+ // The buffer storing the prefetched data
+ mutable AlignedBuffer buffer_;
+ // The offset in file_, corresponding to data stored in buffer_
+ mutable uint64_t buffer_offset_;
+};
+} // namespace
+
+std::unique_ptr<FSRandomAccessFile> NewReadaheadRandomAccessFile(
+ std::unique_ptr<FSRandomAccessFile>&& file, size_t readahead_size) {
+ std::unique_ptr<FSRandomAccessFile> result(
+ new ReadaheadRandomAccessFile(std::move(file), readahead_size));
+ return result;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_raf.h b/src/rocksdb/file/readahead_raf.h
new file mode 100644
index 000000000..dfaf2b4fa
--- /dev/null
+++ b/src/rocksdb/file/readahead_raf.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FSRandomAccessFile;
+// This file provides the following main abstractions:
+// SequentialFileReader : wrapper over Env::SequentialFile
+// RandomAccessFileReader : wrapper over Env::RandomAccessFile
+// WritableFileWriter : wrapper over Env::WritableFile
+// In addition, it also exposes NewReadaheadRandomAccessFile, NewWritableFile,
+// and ReadOneLine primitives.
+
+// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to
+// always prefetch additional data with every read. This is mainly used in
+// Compaction Table Readers.
+std::unique_ptr<FSRandomAccessFile> NewReadaheadRandomAccessFile(
+ std::unique_ptr<FSRandomAccessFile>&& file, size_t readahead_size);
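+//
+// Example (hypothetical sketch; `fs`, `fname` and the 256KB readahead size are
+// assumptions of the example):
+//
+//   std::unique_ptr<FSRandomAccessFile> raw;
+//   IOStatus s = fs->NewRandomAccessFile(fname, FileOptions(), &raw, nullptr);
+//   if (s.ok()) {
+//     std::unique_ptr<FSRandomAccessFile> readahead_file =
+//         NewReadaheadRandomAccessFile(std::move(raw), 256 * 1024);
+//   }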
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sequence_file_reader.cc b/src/rocksdb/file/sequence_file_reader.cc
new file mode 100644
index 000000000..d51d5be46
--- /dev/null
+++ b/src/rocksdb/file/sequence_file_reader.cc
@@ -0,0 +1,328 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/sequence_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/read_write_util.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+IOStatus SequentialFileReader::Create(
+ const std::shared_ptr<FileSystem>& fs, const std::string& fname,
+ const FileOptions& file_opts, std::unique_ptr<SequentialFileReader>* reader,
+ IODebugContext* dbg, RateLimiter* rate_limiter) {
+ std::unique_ptr<FSSequentialFile> file;
+ IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg);
+ if (io_s.ok()) {
+ reader->reset(new SequentialFileReader(std::move(file), fname, nullptr, {},
+ rate_limiter));
+ }
+ return io_s;
+}
+
+IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch,
+ Env::IOPriority rate_limiter_priority) {
+ IOStatus io_s;
+ if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+ //
+ // |-offset_advance-|---bytes returned--|
+ // |----------------------buf size-------------------------|
+    // |                |                   |                  |
+    // aligned        offset           offset + n      Roundup(offset + n,
+    // offset                                                  alignment)
+ //
+ size_t offset = offset_.fetch_add(n);
+ size_t alignment = file_->GetRequiredBufferAlignment();
+ size_t aligned_offset = TruncateToPageBoundary(alignment, offset);
+ size_t offset_advance = offset - aligned_offset;
+ size_t size = Roundup(offset + n, alignment) - aligned_offset;
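+    // Worked example (illustrative numbers only): with offset = 5000,
+    // n = 3000 and alignment = 4096, aligned_offset = 4096,
+    // offset_advance = 904 and size = Roundup(8000, 4096) - 4096 = 4096,
+    // i.e. one aligned page covering the requested [5000, 8000) range.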
+ size_t r = 0;
+ AlignedBuffer buf;
+ buf.Alignment(alignment);
+ buf.AllocateNewBuffer(size);
+
+ while (buf.CurrentSize() < size) {
+ size_t allowed;
+ if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+ allowed = rate_limiter_->RequestToken(
+ buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
+ rate_limiter_priority, nullptr /* stats */,
+ RateLimiter::OpType::kRead);
+ } else {
+ assert(buf.CurrentSize() == 0);
+ allowed = size;
+ }
+
+ Slice tmp;
+ uint64_t orig_offset = 0;
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ orig_offset = aligned_offset + buf.CurrentSize();
+ start_ts = FileOperationInfo::StartNow();
+ }
+ io_s = file_->PositionedRead(aligned_offset + buf.CurrentSize(), allowed,
+ IOOptions(), &tmp, buf.Destination(),
+ nullptr /* dbg */);
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
+ io_s);
+ }
+ buf.Size(buf.CurrentSize() + tmp.size());
+ if (!io_s.ok() || tmp.size() < allowed) {
+ break;
+ }
+ }
+
+ if (io_s.ok() && offset_advance < buf.CurrentSize()) {
+ r = buf.Read(scratch, offset_advance,
+ std::min(buf.CurrentSize() - offset_advance, n));
+ }
+ *result = Slice(scratch, r);
+#endif // !ROCKSDB_LITE
+ } else {
+    // To be paranoid, modify scratch a little bit, so that in case the
+    // underlying FileSystem doesn't fill the buffer but returns success and
+    // `scratch` still contains a previous block, the returned value will not
+    // pass the checksum.
+    // It's hard to find a useful byte for the direct I/O case, so we skip it.
+ if (n > 0 && scratch != nullptr) {
+ scratch[0]++;
+ }
+
+ size_t read = 0;
+ while (read < n) {
+ size_t allowed;
+ if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+ allowed = rate_limiter_->RequestToken(
+ n - read, 0 /* alignment */, rate_limiter_priority,
+ nullptr /* stats */, RateLimiter::OpType::kRead);
+ } else {
+ allowed = n;
+ }
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif
+ Slice tmp;
+ io_s = file_->Read(allowed, IOOptions(), &tmp, scratch + read,
+ nullptr /* dbg */);
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ size_t offset = offset_.fetch_add(tmp.size());
+ NotifyOnFileReadFinish(offset, tmp.size(), start_ts, finish_ts, io_s);
+ }
+#endif
+ read += tmp.size();
+ if (!io_s.ok() || tmp.size() < allowed) {
+ break;
+ }
+ }
+ *result = Slice(scratch, read);
+ }
+ IOSTATS_ADD(bytes_read, result->size());
+ return io_s;
+}
+
+IOStatus SequentialFileReader::Skip(uint64_t n) {
+#ifndef ROCKSDB_LITE
+ if (use_direct_io()) {
+ offset_ += static_cast<size_t>(n);
+ return IOStatus::OK();
+ }
+#endif // !ROCKSDB_LITE
+ return file_->Skip(n);
+}
+
+namespace {
+// This class wraps a SequentialFile, exposing the same API, with the difference
+// of being able to prefetch up to readahead_size bytes and then serve them
+// from memory, avoiding the entire round-trip if, for example, the data for the
+// file is actually remote.
+class ReadaheadSequentialFile : public FSSequentialFile {
+ public:
+ ReadaheadSequentialFile(std::unique_ptr<FSSequentialFile>&& file,
+ size_t readahead_size)
+ : file_(std::move(file)),
+ alignment_(file_->GetRequiredBufferAlignment()),
+ readahead_size_(Roundup(readahead_size, alignment_)),
+ buffer_(),
+ buffer_offset_(0),
+ read_offset_(0) {
+ buffer_.Alignment(alignment_);
+ buffer_.AllocateNewBuffer(readahead_size_);
+ }
+
+ ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete;
+
+ ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete;
+
+ IOStatus Read(size_t n, const IOOptions& opts, Slice* result, char* scratch,
+ IODebugContext* dbg) override {
+ std::unique_lock<std::mutex> lk(lock_);
+
+ size_t cached_len = 0;
+ // Check if there is a cache hit, meaning that [offset, offset + n) is
+ // either completely or partially in the buffer. If it's completely cached,
+ // including end of file case when offset + n is greater than EOF, then
+ // return.
+ if (TryReadFromCache(n, &cached_len, scratch) &&
+ (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+ // We read exactly what we needed, or we hit end of file - return.
+ *result = Slice(scratch, cached_len);
+ return IOStatus::OK();
+ }
+ n -= cached_len;
+
+ IOStatus s;
+    // Read-ahead only makes sense if we have some slack left after reading
+ if (n + alignment_ >= readahead_size_) {
+ s = file_->Read(n, opts, result, scratch + cached_len, dbg);
+ if (s.ok()) {
+ read_offset_ += result->size();
+ *result = Slice(scratch, cached_len + result->size());
+ }
+ buffer_.Clear();
+ return s;
+ }
+
+ s = ReadIntoBuffer(readahead_size_, opts, dbg);
+ if (s.ok()) {
+ // The data we need is now in cache, so we can safely read it
+ size_t remaining_len;
+ TryReadFromCache(n, &remaining_len, scratch + cached_len);
+ *result = Slice(scratch, cached_len + remaining_len);
+ }
+ return s;
+ }
+
+ IOStatus Skip(uint64_t n) override {
+ std::unique_lock<std::mutex> lk(lock_);
+ IOStatus s = IOStatus::OK();
+ // First check if we need to skip already cached data
+ if (buffer_.CurrentSize() > 0) {
+ // Do we need to skip beyond cached data?
+ if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) {
+        // Yes. Skip whatever is in memory and adjust the offset accordingly
+ n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_;
+ read_offset_ = buffer_offset_ + buffer_.CurrentSize();
+ } else {
+        // No. The entire section to be skipped is already in the cache.
+ read_offset_ += n;
+ n = 0;
+ }
+ }
+ if (n > 0) {
+ // We still need to skip more, so call the file API for skipping
+ s = file_->Skip(n);
+ if (s.ok()) {
+ read_offset_ += n;
+ }
+ buffer_.Clear();
+ }
+ return s;
+ }
+
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& opts,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override {
+ return file_->PositionedRead(offset, n, opts, result, scratch, dbg);
+ }
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ std::unique_lock<std::mutex> lk(lock_);
+ buffer_.Clear();
+ return file_->InvalidateCache(offset, length);
+ }
+
+ bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read n bytes from buffer_. If anything was read from the cache,
+  // it sets cached_len to the number of bytes actually read, copies that many
+  // bytes to scratch and returns true.
+  // If nothing was read, it sets cached_len to 0 and returns false.
+ bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) {
+ if (read_offset_ < buffer_offset_ ||
+ read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) {
+ *cached_len = 0;
+ return false;
+ }
+ uint64_t offset_in_buffer = read_offset_ - buffer_offset_;
+ *cached_len = std::min(
+ buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+ memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+ read_offset_ += *cached_len;
+ return true;
+ }
+
+ // Reads into buffer_ the next n bytes from file_.
+ // Can actually read less if EOF was reached.
+  // Returns the status of the read operation on the file.
+ IOStatus ReadIntoBuffer(size_t n, const IOOptions& opts,
+ IODebugContext* dbg) {
+ if (n > buffer_.Capacity()) {
+ n = buffer_.Capacity();
+ }
+ assert(IsFileSectorAligned(n, alignment_));
+ Slice result;
+ IOStatus s = file_->Read(n, opts, &result, buffer_.BufferStart(), dbg);
+ if (s.ok()) {
+ buffer_offset_ = read_offset_;
+ buffer_.Size(result.size());
+ assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+ }
+ return s;
+ }
+
+ const std::unique_ptr<FSSequentialFile> file_;
+ const size_t alignment_;
+ const size_t readahead_size_;
+
+ std::mutex lock_;
+ // The buffer storing the prefetched data
+ AlignedBuffer buffer_;
+ // The offset in file_, corresponding to data stored in buffer_
+ uint64_t buffer_offset_;
+ // The offset up to which data was read from file_. In fact, it can be larger
+ // than the actual file size, since the file_->Skip(n) call doesn't return the
+ // actual number of bytes that were skipped, which can be less than n.
+  // This is not a problem since read_offset_ is monotonically increasing and
+  // its only use is to figure out if the next piece of data should be read
+  // from buffer_ or file_ directly.
+ uint64_t read_offset_;
+};
+} // namespace
+
+std::unique_ptr<FSSequentialFile>
+SequentialFileReader::NewReadaheadSequentialFile(
+ std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size) {
+ if (file->GetRequiredBufferAlignment() >= readahead_size) {
+ // Short-circuit and return the original file if readahead_size is
+ // too small and hence doesn't make sense to be used for prefetching.
+ return std::move(file);
+ }
+ std::unique_ptr<FSSequentialFile> result(
+ new ReadaheadSequentialFile(std::move(file), readahead_size));
+ return result;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sequence_file_reader.h b/src/rocksdb/file/sequence_file_reader.h
new file mode 100644
index 000000000..baea10eb7
--- /dev/null
+++ b/src/rocksdb/file/sequence_file_reader.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+
+#include "env/file_system_tracer.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles
+// Buffered (i.e. when the page cache is enabled) and Direct (with O_DIRECT /
+// page cache disabled) reads appropriately, and also updates the IO stats.
+class SequentialFileReader {
+ private:
+#ifndef ROCKSDB_LITE
+ void NotifyOnFileReadFinish(
+ uint64_t offset, size_t length,
+ const FileOperationInfo::StartTimePoint& start_ts,
+ const FileOperationInfo::FinishTimePoint& finish_ts,
+ const Status& status) const {
+ FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts,
+ finish_ts, status);
+ info.offset = offset;
+ info.length = length;
+
+ for (auto& listener : listeners_) {
+ listener->OnFileReadFinish(info);
+ }
+ info.status.PermitUncheckedError();
+ }
+
+ void AddFileIOListeners(
+ const std::vector<std::shared_ptr<EventListener>>& listeners) {
+ std::for_each(listeners.begin(), listeners.end(),
+ [this](const std::shared_ptr<EventListener>& e) {
+ if (e->ShouldBeNotifiedOnFileIO()) {
+ listeners_.emplace_back(e);
+ }
+ });
+ }
+#endif // ROCKSDB_LITE
+
+ bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+
+ std::string file_name_;
+ FSSequentialFilePtr file_;
+ std::atomic<size_t> offset_{0}; // read offset
+ std::vector<std::shared_ptr<EventListener>> listeners_{};
+ RateLimiter* rate_limiter_;
+
+ public:
+ explicit SequentialFileReader(
+ std::unique_ptr<FSSequentialFile>&& _file, const std::string& _file_name,
+ const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+ const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+ RateLimiter* rate_limiter =
+ nullptr) // TODO: migrate call sites to provide rate limiter
+ : file_name_(_file_name),
+ file_(std::move(_file), io_tracer, _file_name),
+ listeners_(),
+ rate_limiter_(rate_limiter) {
+#ifndef ROCKSDB_LITE
+ AddFileIOListeners(listeners);
+#else
+ (void)listeners;
+#endif
+ }
+
+ explicit SequentialFileReader(
+ std::unique_ptr<FSSequentialFile>&& _file, const std::string& _file_name,
+ size_t _readahead_size,
+ const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+ const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+ RateLimiter* rate_limiter =
+ nullptr) // TODO: migrate call sites to provide rate limiter
+ : file_name_(_file_name),
+ file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size),
+ io_tracer, _file_name),
+ listeners_(),
+ rate_limiter_(rate_limiter) {
+#ifndef ROCKSDB_LITE
+ AddFileIOListeners(listeners);
+#else
+ (void)listeners;
+#endif
+ }
+ static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<SequentialFileReader>* reader,
+ IODebugContext* dbg, RateLimiter* rate_limiter);
+
+ SequentialFileReader(const SequentialFileReader&) = delete;
+ SequentialFileReader& operator=(const SequentialFileReader&) = delete;
+
+ // `rate_limiter_priority` is used to charge the internal rate limiter when
+ // enabled. The special value `Env::IO_TOTAL` makes this operation bypass the
+ // rate limiter. The amount charged to the internal rate limiter is n, even
+ // when less than n bytes are actually read (e.g. at end of file). To avoid
+ // overcharging the rate limiter, the caller can use file size to cap n to
+ // read until end of file.
+ IOStatus Read(size_t n, Slice* result, char* scratch,
+ Env::IOPriority rate_limiter_priority);
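+  //
+  // Example (hypothetical sketch; `reader` and the buffer size are assumptions
+  // of the example):
+  //
+  //   char scratch[4096];
+  //   Slice result;
+  //   IOStatus s = reader->Read(sizeof(scratch), &result, scratch,
+  //                             Env::IO_LOW /* charge the rate limiter */);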
+
+ IOStatus Skip(uint64_t n);
+
+ FSSequentialFile* file() { return file_.get(); }
+
+ std::string file_name() { return file_name_; }
+
+ bool use_direct_io() const { return file_->use_direct_io(); }
+
+ private:
+ // NewReadaheadSequentialFile provides a wrapper over SequentialFile to
+ // always prefetch additional data with every read.
+ static std::unique_ptr<FSSequentialFile> NewReadaheadSequentialFile(
+ std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size);
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sst_file_manager_impl.cc b/src/rocksdb/file/sst_file_manager_impl.cc
new file mode 100644
index 000000000..7053e6a07
--- /dev/null
+++ b/src/rocksdb/file/sst_file_manager_impl.cc
@@ -0,0 +1,525 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "file/sst_file_manager_impl.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/sst_file_manager.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+SstFileManagerImpl::SstFileManagerImpl(
+ const std::shared_ptr<SystemClock>& clock,
+ const std::shared_ptr<FileSystem>& fs,
+ const std::shared_ptr<Logger>& logger, int64_t rate_bytes_per_sec,
+ double max_trash_db_ratio, uint64_t bytes_max_delete_chunk)
+ : clock_(clock),
+ fs_(fs),
+ logger_(logger),
+ total_files_size_(0),
+ compaction_buffer_size_(0),
+ cur_compactions_reserved_size_(0),
+ max_allowed_space_(0),
+ delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec,
+ logger.get(), this, max_trash_db_ratio,
+ bytes_max_delete_chunk),
+ cv_(&mu_),
+ closing_(false),
+ bg_thread_(nullptr),
+ reserved_disk_buffer_(0),
+ free_space_trigger_(0),
+ cur_instance_(nullptr) {}
+
+SstFileManagerImpl::~SstFileManagerImpl() {
+ Close();
+ bg_err_.PermitUncheckedError();
+}
+
+void SstFileManagerImpl::Close() {
+ {
+ MutexLock l(&mu_);
+ if (closing_) {
+ return;
+ }
+ closing_ = true;
+ cv_.SignalAll();
+ }
+ if (bg_thread_) {
+ bg_thread_->join();
+ }
+}
+
+Status SstFileManagerImpl::OnAddFile(const std::string& file_path) {
+ uint64_t file_size;
+ Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+ if (s.ok()) {
+ MutexLock l(&mu_);
+ OnAddFileImpl(file_path, file_size);
+ }
+ TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile",
+ const_cast<std::string*>(&file_path));
+ return s;
+}
+
+Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
+ uint64_t file_size) {
+ MutexLock l(&mu_);
+ OnAddFileImpl(file_path, file_size);
+ TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile",
+ const_cast<std::string*>(&file_path));
+ return Status::OK();
+}
+
+Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
+ {
+ MutexLock l(&mu_);
+ OnDeleteFileImpl(file_path);
+ }
+ TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile",
+ const_cast<std::string*>(&file_path));
+ return Status::OK();
+}
+
+void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) {
+ MutexLock l(&mu_);
+ uint64_t size_added_by_compaction = 0;
+ for (size_t i = 0; i < c->num_input_levels(); i++) {
+ for (size_t j = 0; j < c->num_input_files(i); j++) {
+ FileMetaData* filemeta = c->input(i, j);
+ size_added_by_compaction += filemeta->fd.GetFileSize();
+ }
+ }
+ cur_compactions_reserved_size_ -= size_added_by_compaction;
+}
+
+Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
+ const std::string& new_path,
+ uint64_t* file_size) {
+ {
+ MutexLock l(&mu_);
+ if (file_size != nullptr) {
+ *file_size = tracked_files_[old_path];
+ }
+ OnAddFileImpl(new_path, tracked_files_[old_path]);
+ OnDeleteFileImpl(old_path);
+ }
+ TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
+ return Status::OK();
+}
+
+void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
+ MutexLock l(&mu_);
+ max_allowed_space_ = max_allowed_space;
+}
+
+void SstFileManagerImpl::SetCompactionBufferSize(
+ uint64_t compaction_buffer_size) {
+ MutexLock l(&mu_);
+ compaction_buffer_size_ = compaction_buffer_size;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
+ MutexLock l(&mu_);
+ if (max_allowed_space_ <= 0) {
+ return false;
+ }
+ return total_files_size_ >= max_allowed_space_;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
+ MutexLock l(&mu_);
+ if (max_allowed_space_ <= 0) {
+ return false;
+ }
+ return total_files_size_ + cur_compactions_reserved_size_ >=
+ max_allowed_space_;
+}
+
+bool SstFileManagerImpl::EnoughRoomForCompaction(
+ ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+ const Status& bg_error) {
+ MutexLock l(&mu_);
+ uint64_t size_added_by_compaction = 0;
+ // First check if we even have the space to do the compaction
+ for (size_t i = 0; i < inputs.size(); i++) {
+ for (size_t j = 0; j < inputs[i].size(); j++) {
+ FileMetaData* filemeta = inputs[i][j];
+ size_added_by_compaction += filemeta->fd.GetFileSize();
+ }
+ }
+
+  // Update cur_compactions_reserved_size_ so concurrent compactions
+  // don't max out space
+ size_t needed_headroom = cur_compactions_reserved_size_ +
+ size_added_by_compaction + compaction_buffer_size_;
+ if (max_allowed_space_ != 0 &&
+ (needed_headroom + total_files_size_ > max_allowed_space_)) {
+ return false;
+ }
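+  // Worked example (illustrative numbers only): with 10GB already reserved by
+  // running compactions, a 5GB input set and a 1GB compaction buffer,
+  // needed_headroom is 16GB; if tracked files total 90GB and
+  // max_allowed_space_ is 100GB, then 16GB + 90GB > 100GB and the compaction
+  // is rejected.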
+
+  // Implement more aggressive checks only if this DB instance has already
+  // seen a NoSpace() error. This is in order to contain a single potentially
+  // misbehaving DB instance and prevent it from slowing down compactions of
+  // other DB instances.
+ if (bg_error.IsNoSpace() && CheckFreeSpace()) {
+ auto fn =
+ TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(),
+ inputs[0][0]->fd.GetPathId());
+ uint64_t free_space = 0;
+ Status s = fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr);
+ s.PermitUncheckedError(); // TODO: Check the status
+ // needed_headroom is based on current size reserved by compactions,
+ // minus any files created by running compactions as they would count
+    // against the reserved size. If the user didn't specify any compaction
+    // buffer, add reserved_disk_buffer_ that's calculated by default so the
+    // compaction doesn't end up leaving nothing for logs and flush SSTs.
+ if (compaction_buffer_size_ == 0) {
+ needed_headroom += reserved_disk_buffer_;
+ }
+ if (free_space < needed_headroom + size_added_by_compaction) {
+ // We hit the condition of not enough disk space
+ ROCKS_LOG_ERROR(logger_,
+ "free space [%" PRIu64
+ " bytes] is less than "
+ "needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
+ free_space, needed_headroom);
+ return false;
+ }
+ }
+
+ cur_compactions_reserved_size_ += size_added_by_compaction;
+ // Take a snapshot of cur_compactions_reserved_size_ for when we encounter
+ // a NoSpace error.
+ free_space_trigger_ = cur_compactions_reserved_size_;
+ return true;
+}
+
+uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
+ MutexLock l(&mu_);
+ return cur_compactions_reserved_size_;
+}
+
+uint64_t SstFileManagerImpl::GetTotalSize() {
+ MutexLock l(&mu_);
+ return total_files_size_;
+}
+
+std::unordered_map<std::string, uint64_t>
+SstFileManagerImpl::GetTrackedFiles() {
+ MutexLock l(&mu_);
+ return tracked_files_;
+}
+
+int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
+ return delete_scheduler_.GetRateBytesPerSecond();
+}
+
+void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
+ return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
+}
+
+double SstFileManagerImpl::GetMaxTrashDBRatio() {
+ return delete_scheduler_.GetMaxTrashDBRatio();
+}
+
+void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
+ return delete_scheduler_.SetMaxTrashDBRatio(r);
+}
+
+uint64_t SstFileManagerImpl::GetTotalTrashSize() {
+ return delete_scheduler_.GetTotalTrashSize();
+}
+
+void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
+ const std::string& path) {
+ MutexLock l(&mu_);
+
+ reserved_disk_buffer_ += size;
+ if (path_.empty()) {
+ path_ = path;
+ }
+}
+
+void SstFileManagerImpl::ClearError() {
+ while (true) {
+ MutexLock l(&mu_);
+
+ if (error_handler_list_.empty() || closing_) {
+ return;
+ }
+
+ uint64_t free_space = 0;
+ Status s = fs_->GetFreeSpace(path_, IOOptions(), &free_space, nullptr);
+ free_space = max_allowed_space_ > 0
+ ? std::min(max_allowed_space_, free_space)
+ : free_space;
+ if (s.ok()) {
+ // In case of multi-DB instances, some of them may have experienced a
+ // soft error and some a hard error. In the SstFileManagerImpl, a hard
+ // error will basically override previously reported soft errors. Once
+ // we clear the hard error, we don't keep track of previous errors for
+ // now
+ if (bg_err_.severity() == Status::Severity::kHardError) {
+ if (free_space < reserved_disk_buffer_) {
+ ROCKS_LOG_ERROR(logger_,
+ "free space [%" PRIu64
+ " bytes] is less than "
+ "required disk buffer [%" PRIu64 " bytes]\n",
+ free_space, reserved_disk_buffer_);
+ ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
+ s = Status::NoSpace();
+ }
+ } else if (bg_err_.severity() == Status::Severity::kSoftError) {
+ if (free_space < free_space_trigger_) {
+ ROCKS_LOG_WARN(logger_,
+ "free space [%" PRIu64
+ " bytes] is less than "
+ "free space for compaction trigger [%" PRIu64
+ " bytes]\n",
+ free_space, free_space_trigger_);
+ ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
+ s = Status::NoSpace();
+ }
+ }
+ }
+
+ // Someone could have called CancelErrorRecovery() and the list could have
+ // become empty, so check again here
+ if (s.ok()) {
+ assert(!error_handler_list_.empty());
+ auto error_handler = error_handler_list_.front();
+      // Since we will release the mutex, set cur_instance_ to signal to the
+      // shutdown thread, if it calls CancelErrorRecovery() in the meantime,
+      // that this DB instance is busy. The DB instance is guaranteed not to
+      // be deleted before RecoverFromBGError() returns, since the
+      // ErrorHandler::recovery_in_prog_ flag would be true.
+ cur_instance_ = error_handler;
+ mu_.Unlock();
+ s = error_handler->RecoverFromBGError();
+ TEST_SYNC_POINT("SstFileManagerImpl::ErrorCleared");
+ mu_.Lock();
+ // The DB instance might have been deleted while we were
+      // waiting for the mutex, so check cur_instance_ to make sure it's
+      // still non-null.
+ if (cur_instance_) {
+ // Check for error again, since the instance may have recovered but
+ // immediately got another error. If that's the case, and the new
+ // error is also a NoSpace() non-fatal error, leave the instance in
+ // the list
+ Status err = cur_instance_->GetBGError();
+ if (s.ok() && err.subcode() == IOStatus::SubCode::kNoSpace &&
+ err.severity() < Status::Severity::kFatalError) {
+ s = err;
+ }
+ cur_instance_ = nullptr;
+ }
+
+ if (s.ok() || s.IsShutdownInProgress() ||
+ (!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
+ // If shutdown is in progress, abandon this handler instance
+ // and continue with the others
+ error_handler_list_.pop_front();
+ }
+ }
+
+ if (!error_handler_list_.empty()) {
+ // If there are more instances to be recovered, reschedule after 5
+ // seconds
+ int64_t wait_until = clock_->NowMicros() + 5000000;
+ cv_.TimedWait(wait_until);
+ }
+
+ // Check again for error_handler_list_ empty, as a DB instance shutdown
+ // could have removed it from the queue while we were in timed wait
+ if (error_handler_list_.empty()) {
+ ROCKS_LOG_INFO(logger_, "Clearing error\n");
+ bg_err_ = Status::OK();
+ return;
+ }
+ }
+}
+
+void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
+ Status bg_error) {
+ MutexLock l(&mu_);
+ if (bg_error.severity() == Status::Severity::kSoftError) {
+ if (bg_err_.ok()) {
+ // Setting bg_err_ basically means we're in degraded mode
+ // Assume that all pending compactions will fail similarly. The trigger
+ // for clearing this condition is set to current compaction reserved
+ // size, so we stop checking disk space available in
+ // EnoughRoomForCompaction once this much free space is available
+ bg_err_ = bg_error;
+ }
+ } else if (bg_error.severity() == Status::Severity::kHardError) {
+ bg_err_ = bg_error;
+ } else {
+ assert(false);
+ }
+
+  // If this is the first instance of this error, kick off a thread to poll
+  // and recover from this condition.
+ if (error_handler_list_.empty()) {
+ error_handler_list_.push_back(handler);
+    // Release the lock before calling join. It's ok to do so because
+    // error_handler_list_ is now non-empty, so no other invocation of this
+    // function will execute this piece of code.
+ mu_.Unlock();
+ if (bg_thread_) {
+ bg_thread_->join();
+ }
+ // Start a new thread. The previous one would have exited.
+ bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this));
+ mu_.Lock();
+ } else {
+ // Check if this DB instance is already in the list
+ for (auto iter = error_handler_list_.begin();
+ iter != error_handler_list_.end(); ++iter) {
+ if ((*iter) == handler) {
+ return;
+ }
+ }
+ error_handler_list_.push_back(handler);
+ }
+}
+
+bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
+ MutexLock l(&mu_);
+
+ if (cur_instance_ == handler) {
+ // This instance is currently busy attempting to recover
+ // Nullify it so the recovery thread doesn't attempt to access it again
+ cur_instance_ = nullptr;
+ return false;
+ }
+
+ for (auto iter = error_handler_list_.begin();
+ iter != error_handler_list_.end(); ++iter) {
+ if ((*iter) == handler) {
+ error_handler_list_.erase(iter);
+ return true;
+ }
+ }
+ return false;
+}
+
+Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path,
+ const std::string& path_to_sync,
+ const bool force_bg) {
+ TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::ScheduleFileDeletion",
+ const_cast<std::string*>(&file_path));
+ return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg);
+}
+
+void SstFileManagerImpl::WaitForEmptyTrash() {
+ delete_scheduler_.WaitForEmptyTrash();
+}
+
+void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
+ uint64_t file_size) {
+ auto tracked_file = tracked_files_.find(file_path);
+ if (tracked_file != tracked_files_.end()) {
+ // File was added before, we will just update the size
+ total_files_size_ -= tracked_file->second;
+ total_files_size_ += file_size;
+ cur_compactions_reserved_size_ -= file_size;
+ } else {
+ total_files_size_ += file_size;
+ }
+ tracked_files_[file_path] = file_size;
+}
+
+void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) {
+ auto tracked_file = tracked_files_.find(file_path);
+ if (tracked_file == tracked_files_.end()) {
+ // File is not tracked
+ return;
+ }
+
+ total_files_size_ -= tracked_file->second;
+ tracked_files_.erase(tracked_file);
+}
+
+SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log,
+ std::string trash_dir,
+ int64_t rate_bytes_per_sec,
+ bool delete_existing_trash, Status* status,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk) {
+ const auto& fs = env->GetFileSystem();
+ return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec,
+ delete_existing_trash, status, max_trash_db_ratio,
+ bytes_max_delete_chunk);
+}
+
+SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<FileSystem> fs,
+ std::shared_ptr<Logger> info_log,
+ const std::string& trash_dir,
+ int64_t rate_bytes_per_sec,
+ bool delete_existing_trash, Status* status,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk) {
+ const auto& clock = env->GetSystemClock();
+ SstFileManagerImpl* res =
+ new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec,
+ max_trash_db_ratio, bytes_max_delete_chunk);
+
+ // trash_dir is deprecated and no longer needed, but if the user passed one
+ // we will still remove the files in it.
+ Status s = Status::OK();
+ if (delete_existing_trash && trash_dir != "") {
+ std::vector<std::string> files_in_trash;
+ s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr);
+ if (s.ok()) {
+ for (const std::string& trash_file : files_in_trash) {
+ std::string path_in_trash = trash_dir + "/" + trash_file;
+ res->OnAddFile(path_in_trash);
+ Status file_delete =
+ res->ScheduleFileDeletion(path_in_trash, trash_dir);
+ if (s.ok() && !file_delete.ok()) {
+ s = file_delete;
+ }
+ }
+ }
+ }
+
+ if (status) {
+ *status = s;
+ } else {
+ // No one passed us a Status, so they must not care about the error...
+ s.PermitUncheckedError();
+ }
+
+ return res;
+}
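+
+// [Editorial sketch, not part of the original file] A minimal usage example
+// of the factory above, assuming the public declarations in
+// rocksdb/sst_file_manager.h, rocksdb/options.h, and rocksdb/env.h; the option
+// values below are arbitrary:
+//
+//   Status status;
+//   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(
+//       Env::Default(), nullptr /* info_log */, "" /* trash_dir */,
+//       1024 * 1024 /* rate_bytes_per_sec */, true /* delete_existing_trash */,
+//       &status, 0.25 /* max_trash_db_ratio */,
+//       64 * 1024 * 1024 /* bytes_max_delete_chunk */));
+//   Options options;
+//   options.sst_file_manager = sfm;  // deletions are now rate limited to 1MB/s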
+
+#else
+
+SstFileManager* NewSstFileManager(Env* /*env*/,
+ std::shared_ptr<Logger> /*info_log*/,
+ std::string /*trash_dir*/,
+ int64_t /*rate_bytes_per_sec*/,
+ bool /*delete_existing_trash*/,
+ Status* status, double /*max_trash_db_ratio*/,
+ uint64_t /*bytes_max_delete_chunk*/) {
+ if (status) {
+ *status =
+ Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE");
+ }
+ return nullptr;
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sst_file_manager_impl.h b/src/rocksdb/file/sst_file_manager_impl.h
new file mode 100644
index 000000000..548eb57f8
--- /dev/null
+++ b/src/rocksdb/file/sst_file_manager_impl.h
@@ -0,0 +1,195 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "db/compaction/compaction.h"
+#include "file/delete_scheduler.h"
+#include "port/port.h"
+#include "rocksdb/sst_file_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ErrorHandler;
+class FileSystem;
+class SystemClock;
+class Logger;
+
+// SstFileManager is used to track SST and blob files in the DB and control
+// their deletion rate. All SstFileManager public functions are thread-safe.
+class SstFileManagerImpl : public SstFileManager {
+ public:
+ explicit SstFileManagerImpl(const std::shared_ptr<SystemClock>& clock,
+ const std::shared_ptr<FileSystem>& fs,
+ const std::shared_ptr<Logger>& logger,
+ int64_t rate_bytes_per_sec,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk);
+
+ ~SstFileManagerImpl();
+
+ // DB will call OnAddFile whenever a new sst/blob file is added.
+ Status OnAddFile(const std::string& file_path);
+
+ // Overload where size of the file is provided by the caller rather than
+ // queried from the filesystem. This is an optimization.
+ Status OnAddFile(const std::string& file_path, uint64_t file_size);
+
+ // DB will call OnDeleteFile whenever a sst/blob file is deleted.
+ Status OnDeleteFile(const std::string& file_path);
+
+ // DB will call OnMoveFile whenever an sst/blob file is moved to a new path.
+ Status OnMoveFile(const std::string& old_path, const std::string& new_path,
+ uint64_t* file_size = nullptr);
+
+ // Update the maximum allowed space that should be used by RocksDB. If
+ // the total size of the SST and blob files exceeds max_allowed_space, writes
+ // to RocksDB will fail.
+ //
+ // Setting max_allowed_space to 0 disables this feature; the maximum allowed
+ // space will be infinite (the default).
+ //
+ // thread-safe.
+ void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override;
+
+ void SetCompactionBufferSize(uint64_t compaction_buffer_size) override;
+
+ // Return true if the total size of SST and blob files exceeded the maximum
+ // allowed space usage.
+ //
+ // thread-safe.
+ bool IsMaxAllowedSpaceReached() override;
+
+ bool IsMaxAllowedSpaceReachedIncludingCompactions() override;
+
+ // Returns true if there is enough (approximate) space for the specified
+ // compaction. Space is approximate because this function conservatively
+ // estimates how much space is currently being used by compactions (i.e.
+ // if a compaction has started, this function bumps the used space by
+ // the full compaction size).
+ bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+ const std::vector<CompactionInputFiles>& inputs,
+ const Status& bg_error);
+
+ // Bookkeeping so total_files_size_ goes back to normal after the compaction
+ // finishes
+ void OnCompactionCompletion(Compaction* c);
+
+ uint64_t GetCompactionsReservedSize();
+
+ // Return the total size of all tracked files.
+ uint64_t GetTotalSize() override;
+
+ // Return a map containing all tracked files and their corresponding sizes.
+ std::unordered_map<std::string, uint64_t> GetTrackedFiles() override;
+
+ // Return delete rate limit in bytes per second.
+ virtual int64_t GetDeleteRateBytesPerSecond() override;
+
+ // Update the delete rate limit in bytes per second.
+ virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) override;
+
+ // Return trash/DB size ratio where new files will be deleted immediately
+ virtual double GetMaxTrashDBRatio() override;
+
+ // Update trash/DB size ratio where new files will be deleted immediately
+ virtual void SetMaxTrashDBRatio(double ratio) override;
+
+ // Return the total size of trash files
+ uint64_t GetTotalTrashSize() override;
+
+ // Called by each DB instance using this sst file manager to reserve
+ // disk buffer space for recovery from out of space errors
+ void ReserveDiskBuffer(uint64_t buffer, const std::string& path);
+
+ // Set a flag upon encountering disk full. May enqueue the ErrorHandler
+ // instance for background polling and recovery
+ void StartErrorRecovery(ErrorHandler* db, Status bg_error);
+
+ // Remove the given ErrorHandler instance from the recovery queue. It's not
+ // guaranteed to be removed if recovery for it is already in progress, in
+ // which case this returns false.
+ bool CancelErrorRecovery(ErrorHandler* db);
+
+ // Mark the file as trash and schedule its deletion. If force_bg is set, it
+ // forces the file to be deleted in the background regardless of DB size,
+ // except when rate-limited deletion is disabled
+ virtual Status ScheduleFileDeletion(const std::string& file_path,
+ const std::string& dir_to_sync,
+ const bool force_bg = false);
+
+ // Wait for all files being deleted in the background to finish, or for the
+ // destructor to be called.
+ virtual void WaitForEmptyTrash();
+
+ DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
+
+ // Stop the error recovery background thread. This should be called only
+ // once in the object's lifetime, and before the destructor
+ void Close();
+
+ void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) override {
+ stats_ = stats;
+ delete_scheduler_.SetStatisticsPtr(stats);
+ }
+
+ private:
+ // REQUIRES: mutex locked
+ void OnAddFileImpl(const std::string& file_path, uint64_t file_size);
+ // REQUIRES: mutex locked
+ void OnDeleteFileImpl(const std::string& file_path);
+
+ void ClearError();
+ bool CheckFreeSpace() {
+ return bg_err_.severity() == Status::Severity::kSoftError;
+ }
+
+ std::shared_ptr<SystemClock> clock_;
+ std::shared_ptr<FileSystem> fs_;
+ std::shared_ptr<Logger> logger_;
+ // Mutex to protect tracked_files_, total_files_size_
+ port::Mutex mu_;
+ // The summation of the sizes of all files in tracked_files_ map
+ uint64_t total_files_size_;
+ // Compactions should only execute if they can leave at least
+ // this amount of buffer space for logs and flushes
+ uint64_t compaction_buffer_size_;
+ // Estimated size of the current ongoing compactions
+ uint64_t cur_compactions_reserved_size_;
+ // A map containing all tracked files and their sizes
+ // file_path => file_size
+ std::unordered_map<std::string, uint64_t> tracked_files_;
+ // The maximum allowed space (in bytes) for sst and blob files.
+ uint64_t max_allowed_space_;
+ // DeleteScheduler used to throttle file deletion.
+ DeleteScheduler delete_scheduler_;
+ port::CondVar cv_;
+ // Flag to force error recovery thread to exit
+ bool closing_;
+ // Background error recovery thread
+ std::unique_ptr<port::Thread> bg_thread_;
+ // A path in the filesystem corresponding to this SFM. This is used for
+ // calling Env::GetFreeSpace. Posix requires a path in the filesystem
+ std::string path_;
+ // Save the current background error
+ Status bg_err_;
+ // Amount of free disk headroom before allowing recovery from hard errors
+ uint64_t reserved_disk_buffer_;
+ // For soft errors, amount of free disk space before we can allow
+ // compactions to run full throttle. If disk space is below this trigger,
+ // compactions will be gated by free disk space > input size
+ uint64_t free_space_trigger_;
+ // List of database error handler instances tracked by this SstFileManager.
+ std::list<ErrorHandler*> error_handler_list_;
+ // Pointer to ErrorHandler instance that is currently processing recovery
+ ErrorHandler* cur_instance_;
+ std::shared_ptr<Statistics> stats_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/file/writable_file_writer.cc b/src/rocksdb/file/writable_file_writer.cc
new file mode 100644
index 000000000..3afc51c56
--- /dev/null
+++ b/src/rocksdb/file/writable_file_writer.cc
@@ -0,0 +1,1025 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/writable_file_writer.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "db/version_edit.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+IOStatus WritableFileWriter::Create(const std::shared_ptr<FileSystem>& fs,
+ const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<WritableFileWriter>* writer,
+ IODebugContext* dbg) {
+ if (file_opts.use_direct_writes &&
+ 0 == file_opts.writable_file_max_buffer_size) {
+ return IOStatus::InvalidArgument(
+ "Direct write requires writable_file_max_buffer_size > 0");
+ }
+ std::unique_ptr<FSWritableFile> file;
+ IOStatus io_s = fs->NewWritableFile(fname, file_opts, &file, dbg);
+ if (io_s.ok()) {
+ writer->reset(new WritableFileWriter(std::move(file), fname, file_opts));
+ }
+ return io_s;
+}
+
+IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum,
+ Env::IOPriority op_rate_limiter_priority) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+
+ const char* src = data.data();
+ size_t left = data.size();
+ IOStatus s;
+ pending_sync_ = true;
+
+ TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Append:0", REDUCE_ODDS2);
+
+ // Calculate the checksum of appended data
+ UpdateFileChecksum(data);
+
+ {
+ IOOptions io_options;
+ io_options.rate_limiter_priority =
+ WritableFileWriter::DecideRateLimiterPriority(
+ writable_file_->GetIOPriority(), op_rate_limiter_priority);
+ IOSTATS_TIMER_GUARD(prepare_write_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
+ writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left,
+ io_options, nullptr);
+ }
+
+ // See whether we need to enlarge the buffer to avoid the flush
+ if (buf_.Capacity() - buf_.CurrentSize() < left) {
+ for (size_t cap = buf_.Capacity();
+ cap < max_buffer_size_; // There is still room to increase
+ cap *= 2) {
+ // See whether the next available size is large enough.
+ // Buffer will never be increased to more than max_buffer_size_.
+ size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
+ if (desired_capacity - buf_.CurrentSize() >= left ||
+ (use_direct_io() && desired_capacity == max_buffer_size_)) {
+ buf_.AllocateNewBuffer(desired_capacity, true);
+ break;
+ }
+ }
+ }
+
+ // Flush only when using buffered I/O
+ if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
+ if (buf_.CurrentSize() > 0) {
+ s = Flush(op_rate_limiter_priority);
+ if (!s.ok()) {
+ set_seen_error();
+ return s;
+ }
+ }
+ assert(buf_.CurrentSize() == 0);
+ }
+
+ if (perform_data_verification_ && buffered_data_with_checksum_ &&
+ crc32c_checksum != 0) {
+ // Since we want to use the checksum of the input data, we cannot break it
+ // into several pieces. We only write it to the buffer when there is enough
+ // space. Otherwise, we write it out directly.
+ if (use_direct_io() || (buf_.Capacity() - buf_.CurrentSize()) >= left) {
+ if ((buf_.Capacity() - buf_.CurrentSize()) >= left) {
+ size_t appended = buf_.Append(src, left);
+ if (appended != left) {
+ s = IOStatus::Corruption("Write buffer append failure");
+ }
+ buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine(
+ buffered_data_crc32c_checksum_, crc32c_checksum, appended);
+ } else {
+ while (left > 0) {
+ size_t appended = buf_.Append(src, left);
+ buffered_data_crc32c_checksum_ =
+ crc32c::Extend(buffered_data_crc32c_checksum_, src, appended);
+ left -= appended;
+ src += appended;
+
+ if (left > 0) {
+ s = Flush(op_rate_limiter_priority);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ } else {
+ assert(buf_.CurrentSize() == 0);
+ buffered_data_crc32c_checksum_ = crc32c_checksum;
+ s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority);
+ }
+ } else {
+ // In this case, either we do not need to do the data verification or the
+ // caller does not provide the checksum of the data (crc32c_checksum = 0).
+ //
+ // With direct I/O on we never write directly to disk; otherwise we simply
+ // use the buffer for its original purpose of accumulating many small chunks.
+ if (use_direct_io() || (buf_.Capacity() >= left)) {
+ while (left > 0) {
+ size_t appended = buf_.Append(src, left);
+ if (perform_data_verification_ && buffered_data_with_checksum_) {
+ buffered_data_crc32c_checksum_ =
+ crc32c::Extend(buffered_data_crc32c_checksum_, src, appended);
+ }
+ left -= appended;
+ src += appended;
+
+ if (left > 0) {
+ s = Flush(op_rate_limiter_priority);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ } else {
+ // Writing directly to file bypassing the buffer
+ assert(buf_.CurrentSize() == 0);
+ if (perform_data_verification_ && buffered_data_with_checksum_) {
+ buffered_data_crc32c_checksum_ = crc32c::Value(src, left);
+ s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority);
+ } else {
+ s = WriteBuffered(src, left, op_rate_limiter_priority);
+ }
+ }
+ }
+
+ TEST_KILL_RANDOM("WritableFileWriter::Append:1");
+ if (s.ok()) {
+ uint64_t cur_size = filesize_.load(std::memory_order_acquire);
+ filesize_.store(cur_size + data.size(), std::memory_order_release);
+ } else {
+ set_seen_error();
+ }
+ return s;
+}
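+
+// [Editorial sketch, not part of the original file] The running buffer
+// checksum maintained in Append() relies on the composition property of
+// crc32c::Crc32cCombine: the CRC of a concatenation equals the combination of
+// the two piecewise CRCs. Roughly, with the crc32c helpers included above:
+//
+//   const char* a = "hello";
+//   const char* b = " world";
+//   uint32_t crc_a = crc32c::Value(a, 5);
+//   uint32_t crc_b = crc32c::Value(b, 6);
+//   uint32_t combined = crc32c::Crc32cCombine(crc_a, crc_b, 6 /* len of b */);
+//   assert(combined == crc32c::Value("hello world", 11));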
+
+IOStatus WritableFileWriter::Pad(const size_t pad_bytes,
+ Env::IOPriority op_rate_limiter_priority) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+ assert(pad_bytes < kDefaultPageSize);
+ size_t left = pad_bytes;
+ size_t cap = buf_.Capacity() - buf_.CurrentSize();
+ size_t pad_start = buf_.CurrentSize();
+
+ // Assume pad_bytes is small compared to buf_ capacity, so we always use
+ // buf_ rather than writing directly to the file, as Append() does in
+ // certain cases.
+ while (left) {
+ size_t append_bytes = std::min(cap, left);
+ buf_.PadWith(append_bytes, 0);
+ left -= append_bytes;
+ if (left > 0) {
+ IOStatus s = Flush(op_rate_limiter_priority);
+ if (!s.ok()) {
+ set_seen_error();
+ return s;
+ }
+ }
+ cap = buf_.Capacity() - buf_.CurrentSize();
+ }
+ pending_sync_ = true;
+ uint64_t cur_size = filesize_.load(std::memory_order_acquire);
+ filesize_.store(cur_size + pad_bytes, std::memory_order_release);
+ if (perform_data_verification_) {
+ buffered_data_crc32c_checksum_ =
+ crc32c::Extend(buffered_data_crc32c_checksum_,
+ buf_.BufferStart() + pad_start, pad_bytes);
+ }
+ return IOStatus::OK();
+}
+
+IOStatus WritableFileWriter::Close() {
+ if (seen_error()) {
+ IOStatus interim;
+ if (writable_file_.get() != nullptr) {
+ interim = writable_file_->Close(IOOptions(), nullptr);
+ writable_file_.reset();
+ }
+ if (interim.ok()) {
+ return IOStatus::IOError(
+ "File is closed but data not flushed as writer has previous error.");
+ } else {
+ return interim;
+ }
+ }
+
+ // Do not quit immediately on failure; the file MUST be closed.
+
+ // It is possible to close the file twice now, since we MUST close it in the
+ // destructor; simply flushing is not enough. On Windows, pre-allocation does
+ // not fill with zeros, and with unbuffered access we also need to set the
+ // end of data.
+ if (writable_file_.get() == nullptr) {
+ return IOStatus::OK();
+ }
+
+ IOStatus s;
+ s = Flush(); // flush cache to OS
+
+ IOStatus interim;
+ IOOptions io_options;
+ io_options.rate_limiter_priority = writable_file_->GetIOPriority();
+ // In direct I/O mode we write whole pages so
+ // we need to let the file know where data ends.
+ if (use_direct_io()) {
+ {
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif
+ uint64_t filesz = filesize_.load(std::memory_order_acquire);
+ interim = writable_file_->Truncate(filesz, io_options, nullptr);
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ NotifyOnFileTruncateFinish(start_ts, finish_ts, s);
+ if (!interim.ok()) {
+ NotifyOnIOError(interim, FileOperationType::kTruncate, file_name(),
+ filesz);
+ }
+ }
+#endif
+ }
+ if (interim.ok()) {
+ {
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif
+ interim = writable_file_->Fsync(io_options, nullptr);
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ NotifyOnFileSyncFinish(start_ts, finish_ts, s,
+ FileOperationType::kFsync);
+ if (!interim.ok()) {
+ NotifyOnIOError(interim, FileOperationType::kFsync, file_name());
+ }
+ }
+#endif
+ }
+ }
+ if (!interim.ok() && s.ok()) {
+ s = interim;
+ }
+ }
+
+ TEST_KILL_RANDOM("WritableFileWriter::Close:0");
+ {
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif
+ interim = writable_file_->Close(io_options, nullptr);
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = FileOperationInfo::FinishNow();
+ NotifyOnFileCloseFinish(start_ts, finish_ts, s);
+ if (!interim.ok()) {
+ NotifyOnIOError(interim, FileOperationType::kClose, file_name());
+ }
+ }
+#endif
+ }
+ if (!interim.ok() && s.ok()) {
+ s = interim;
+ }
+
+ writable_file_.reset();
+ TEST_KILL_RANDOM("WritableFileWriter::Close:1");
+
+ if (s.ok()) {
+ if (checksum_generator_ != nullptr && !checksum_finalized_) {
+ checksum_generator_->Finalize();
+ checksum_finalized_ = true;
+ }
+ } else {
+ set_seen_error();
+ }
+
+ return s;
+}
+
+// Write out the cached data to the OS cache, or to storage if direct I/O is
+// enabled
+IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+
+ IOStatus s;
+ TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2);
+
+ if (buf_.CurrentSize() > 0) {
+ if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+ if (pending_sync_) {
+ if (perform_data_verification_ && buffered_data_with_checksum_) {
+ s = WriteDirectWithChecksum(op_rate_limiter_priority);
+ } else {
+ s = WriteDirect(op_rate_limiter_priority);
+ }
+ }
+#endif // !ROCKSDB_LITE
+ } else {
+ if (perform_data_verification_ && buffered_data_with_checksum_) {
+ s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(),
+ op_rate_limiter_priority);
+ } else {
+ s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize(),
+ op_rate_limiter_priority);
+ }
+ }
+ if (!s.ok()) {
+ set_seen_error();
+ return s;
+ }
+ }
+
+ {
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif
+ IOOptions io_options;
+ io_options.rate_limiter_priority =
+ WritableFileWriter::DecideRateLimiterPriority(
+ writable_file_->GetIOPriority(), op_rate_limiter_priority);
+ s = writable_file_->Flush(io_options, nullptr);
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::steady_clock::now();
+ NotifyOnFileFlushFinish(start_ts, finish_ts, s);
+ if (!s.ok()) {
+ NotifyOnIOError(s, FileOperationType::kFlush, file_name());
+ }
+ }
+#endif
+ }
+
+ if (!s.ok()) {
+ set_seen_error();
+ return s;
+ }
+
+ // Sync the OS cache to disk every bytes_per_sync_ bytes.
+ // TODO: give log files and sst files different options (log files could
+ // potentially be cached in the OS for their whole lifetime, so we might not
+ // want to flush them at all).
+
+ // We try to avoid syncing the last 1MB of data, for two reasons:
+ // (1) avoid rewriting the same page that may be modified later;
+ // (2) on older OS versions, a write can block while writing out the page.
+ // XFS does neighbor-page flushing outside of the specified ranges, so we
+ // need to make sure the sync range is far from the write offset.
+ if (!use_direct_io() && bytes_per_sync_) {
+ const uint64_t kBytesNotSyncRange =
+ 1024 * 1024; // recent 1MB is not synced.
+ const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
+ uint64_t cur_size = filesize_.load(std::memory_order_acquire);
+ if (cur_size > kBytesNotSyncRange) {
+ uint64_t offset_sync_to = cur_size - kBytesNotSyncRange;
+ offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
+ assert(offset_sync_to >= last_sync_size_);
+ if (offset_sync_to > 0 &&
+ offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
+ s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
+ if (!s.ok()) {
+ set_seen_error();
+ }
+ last_sync_size_ = offset_sync_to;
+ }
+ }
+ }
+
+ return s;
+}
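+
+// [Editorial worked example, not part of the original file] With
+// bytes_per_sync_ = 1MB and last_sync_size_ = 0, suppose the file has grown to
+// 3 MiB (3145728 bytes). Then offset_sync_to = 3145728 - 1048576 = 2097152,
+// which is already 4KB-aligned, and since 2097152 - 0 >= 1MB we issue
+// RangeSync(0, 2097152) and set last_sync_size_ = 2097152. The most recent
+// 1 MiB of the file is intentionally left unsynced until a later Flush().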
+
+std::string WritableFileWriter::GetFileChecksum() {
+ if (checksum_generator_ != nullptr) {
+ assert(checksum_finalized_);
+ return checksum_generator_->GetChecksum();
+ } else {
+ return kUnknownFileChecksum;
+ }
+}
+
+const char* WritableFileWriter::GetFileChecksumFuncName() const {
+ if (checksum_generator_ != nullptr) {
+ return checksum_generator_->Name();
+ } else {
+ return kUnknownFileChecksumFuncName;
+ }
+}
+
+IOStatus WritableFileWriter::Sync(bool use_fsync) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+
+ IOStatus s = Flush();
+ if (!s.ok()) {
+ set_seen_error();
+ return s;
+ }
+ TEST_KILL_RANDOM("WritableFileWriter::Sync:0");
+ if (!use_direct_io() && pending_sync_) {
+ s = SyncInternal(use_fsync);
+ if (!s.ok()) {
+ set_seen_error();
+ return s;
+ }
+ }
+ TEST_KILL_RANDOM("WritableFileWriter::Sync:1");
+ pending_sync_ = false;
+ return IOStatus::OK();
+}
+
+IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+ if (!writable_file_->IsSyncThreadSafe()) {
+ return IOStatus::NotSupported(
+ "Can't WritableFileWriter::SyncWithoutFlush() because "
+ "WritableFile::IsSyncThreadSafe() is false");
+ }
+ TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
+ IOStatus s = SyncInternal(use_fsync);
+ TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
+ if (!s.ok()) {
+#ifndef NDEBUG
+ sync_without_flush_called_ = true;
+#endif // NDEBUG
+ set_seen_error();
+ }
+ return s;
+}
+
+IOStatus WritableFileWriter::SyncInternal(bool use_fsync) {
+ // Caller is supposed to check seen_error_
+ IOStatus s;
+ IOSTATS_TIMER_GUARD(fsync_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
+ auto prev_perf_level = GetPerfLevel();
+
+ IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
+
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif
+
+ IOOptions io_options;
+ io_options.rate_limiter_priority = writable_file_->GetIOPriority();
+ if (use_fsync) {
+ s = writable_file_->Fsync(io_options, nullptr);
+ } else {
+ s = writable_file_->Sync(io_options, nullptr);
+ }
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::steady_clock::now();
+ NotifyOnFileSyncFinish(
+ start_ts, finish_ts, s,
+ use_fsync ? FileOperationType::kFsync : FileOperationType::kSync);
+ if (!s.ok()) {
+ NotifyOnIOError(
+ s, (use_fsync ? FileOperationType::kFsync : FileOperationType::kSync),
+ file_name());
+ }
+ }
+#endif
+ SetPerfLevel(prev_perf_level);
+
+ // The caller will be responsible to call set_seen_error() if s is not OK.
+ return s;
+}
+
+IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+
+ IOSTATS_TIMER_GUARD(range_sync_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+#endif
+ IOOptions io_options;
+ io_options.rate_limiter_priority = writable_file_->GetIOPriority();
+ IOStatus s = writable_file_->RangeSync(offset, nbytes, io_options, nullptr);
+ if (!s.ok()) {
+ set_seen_error();
+ }
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::steady_clock::now();
+ NotifyOnFileRangeSyncFinish(offset, nbytes, start_ts, finish_ts, s);
+ if (!s.ok()) {
+ NotifyOnIOError(s, FileOperationType::kRangeSync, file_name(), nbytes,
+ offset);
+ }
+ }
+#endif
+ return s;
+}
+
+// This method writes the specified data to disk and makes use of the rate
+// limiter if available
+IOStatus WritableFileWriter::WriteBuffered(
+ const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+
+ IOStatus s;
+ assert(!use_direct_io());
+ const char* src = data;
+ size_t left = size;
+ DataVerificationInfo v_info;
+ char checksum_buf[sizeof(uint32_t)];
+ Env::IOPriority rate_limiter_priority_used =
+ WritableFileWriter::DecideRateLimiterPriority(
+ writable_file_->GetIOPriority(), op_rate_limiter_priority);
+ IOOptions io_options;
+ io_options.rate_limiter_priority = rate_limiter_priority_used;
+
+ while (left > 0) {
+ size_t allowed = left;
+ if (rate_limiter_ != nullptr &&
+ rate_limiter_priority_used != Env::IO_TOTAL) {
+ allowed = rate_limiter_->RequestToken(left, 0 /* alignment */,
+ rate_limiter_priority_used, stats_,
+ RateLimiter::OpType::kWrite);
+ }
+
+ {
+ IOSTATS_TIMER_GUARD(write_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr);
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ old_size = next_write_offset_;
+ }
+#endif
+ {
+ auto prev_perf_level = GetPerfLevel();
+
+ IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
+ if (perform_data_verification_) {
+ Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf);
+ v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
+ s = writable_file_->Append(Slice(src, allowed), io_options, v_info,
+ nullptr);
+ } else {
+ s = writable_file_->Append(Slice(src, allowed), io_options, nullptr);
+ }
+ if (!s.ok()) {
+ // If writable_file_->Append() failed, then the data may or may not
+ // exist in the underlying memory buffer, OS page cache, remote file
+ // system's buffer, etc. If WritableFileWriter keeps the data in
+ // buf_, then a future Close() or write retry may send the data to
+ // the underlying file again. If the data does exist in the
+ // underlying buffer and gets written to the file eventually despite
+ // returning error, the file may end up with two duplicate pieces of
+ // data. Therefore, clear the buf_ at the WritableFileWriter layer
+ // and let caller determine error handling.
+ buf_.Size(0);
+ buffered_data_crc32c_checksum_ = 0;
+ }
+ SetPerfLevel(prev_perf_level);
+ }
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::steady_clock::now();
+ NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
+ if (!s.ok()) {
+ NotifyOnIOError(s, FileOperationType::kAppend, file_name(), allowed,
+ old_size);
+ }
+ }
+#endif
+ if (!s.ok()) {
+ set_seen_error();
+ return s;
+ }
+ }
+
+ IOSTATS_ADD(bytes_written, allowed);
+ TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0");
+
+ left -= allowed;
+ src += allowed;
+ uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+ flushed_size_.store(cur_size + allowed, std::memory_order_release);
+ }
+ buf_.Size(0);
+ buffered_data_crc32c_checksum_ = 0;
+ if (!s.ok()) {
+ set_seen_error();
+ }
+ return s;
+}
+
+IOStatus WritableFileWriter::WriteBufferedWithChecksum(
+ const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+
+ IOStatus s;
+ assert(!use_direct_io());
+ assert(perform_data_verification_ && buffered_data_with_checksum_);
+ const char* src = data;
+ size_t left = size;
+ DataVerificationInfo v_info;
+ char checksum_buf[sizeof(uint32_t)];
+ Env::IOPriority rate_limiter_priority_used =
+ WritableFileWriter::DecideRateLimiterPriority(
+ writable_file_->GetIOPriority(), op_rate_limiter_priority);
+ IOOptions io_options;
+ io_options.rate_limiter_priority = rate_limiter_priority_used;
+ // Check how much is allowed. Here, we loop until the rate limiter allows us
+ // to write the entire buffer.
+ // TODO: this needs to be improved since it somewhat defeats the purpose of
+ // the rate limiter
+ size_t data_size = left;
+ if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
+ while (data_size > 0) {
+ size_t tmp_size;
+ tmp_size = rate_limiter_->RequestToken(data_size, buf_.Alignment(),
+ rate_limiter_priority_used, stats_,
+ RateLimiter::OpType::kWrite);
+ data_size -= tmp_size;
+ }
+ }
+
+ {
+ IOSTATS_TIMER_GUARD(write_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::StartTimePoint start_ts;
+ uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr);
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ old_size = next_write_offset_;
+ }
+#endif
+ {
+ auto prev_perf_level = GetPerfLevel();
+
+ IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
+
+ EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_);
+ v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
+ s = writable_file_->Append(Slice(src, left), io_options, v_info, nullptr);
+ SetPerfLevel(prev_perf_level);
+ }
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::steady_clock::now();
+ NotifyOnFileWriteFinish(old_size, left, start_ts, finish_ts, s);
+ if (!s.ok()) {
+ NotifyOnIOError(s, FileOperationType::kAppend, file_name(), left,
+ old_size);
+ }
+ }
+#endif
+ if (!s.ok()) {
+ // If writable_file_->Append() failed, then the data may or may not
+ // exist in the underlying memory buffer, OS page cache, remote file
+ // system's buffer, etc. If WritableFileWriter keeps the data in
+ // buf_, then a future Close() or write retry may send the data to
+ // the underlying file again. If the data does exist in the
+ // underlying buffer and gets written to the file eventually despite
+ // returning error, the file may end up with two duplicate pieces of
+ // data. Therefore, clear the buf_ at the WritableFileWriter layer
+ // and let caller determine error handling.
+ buf_.Size(0);
+ buffered_data_crc32c_checksum_ = 0;
+ set_seen_error();
+ return s;
+ }
+ }
+
+ IOSTATS_ADD(bytes_written, left);
+ TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0");
+
+ // The buffered write succeeded; reset the buffer's current size to 0 and
+ // reset the corresponding checksum value
+ buf_.Size(0);
+ buffered_data_crc32c_checksum_ = 0;
+ uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+ flushed_size_.store(cur_size + left, std::memory_order_release);
+ if (!s.ok()) {
+ set_seen_error();
+ }
+ return s;
+}
+
+void WritableFileWriter::UpdateFileChecksum(const Slice& data) {
+ if (checksum_generator_ != nullptr) {
+ checksum_generator_->Update(data.data(), data.size());
+ }
+}
+
+// Currently, a crc32c checksum is used to calculate the checksum value of the
+// content in the input buffer for handoff. In the future, the checksum might
+// be calculated from the existing crc32c checksums of the data in WAL and
+// MANIFEST records, or even SST file blocks.
+// TODO: effectively use the existing checksum of the data being written to
+// generate the crc32c checksum instead of a raw calculation.
+void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data,
+ size_t size,
+ char* buf) {
+ uint32_t v_crc32c = crc32c::Extend(0, data, size);
+ EncodeFixed32(buf, v_crc32c);
+}
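+
+// [Editorial sketch, not part of the original file] A FileSystem that supports
+// checksum handoff could verify the checksum carried in DataVerificationInfo
+// roughly like this (DecodeFixed32 is assumed to come from util/coding.h):
+//
+//   IOStatus VerifyHandoff(const Slice& data, const DataVerificationInfo& v) {
+//     uint32_t expected = DecodeFixed32(v.checksum.data());
+//     uint32_t actual = crc32c::Extend(0, data.data(), data.size());
+//     return expected == actual
+//                ? IOStatus::OK()
+//                : IOStatus::Corruption("handoff checksum mismatch");
+//   }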
+
+// This flushes the accumulated data in the buffer. We pad the data with zeros
+// if necessary to fill a whole page (during automatic flushes such padding is
+// not necessary). We always use the RateLimiter if available. Any buffered
+// bytes beyond the last whole page are moved (Refit) so they are written again
+// on the next flush, because we can only write at aligned offsets.
+#ifndef ROCKSDB_LITE
+IOStatus WritableFileWriter::WriteDirect(
+ Env::IOPriority op_rate_limiter_priority) {
+ if (seen_error()) {
+ assert(false);
+
+ return IOStatus::IOError("Writer has previous error.");
+ }
+
+ assert(use_direct_io());
+ IOStatus s;
+ const size_t alignment = buf_.Alignment();
+ assert((next_write_offset_ % alignment) == 0);
+
+ // Calculate the whole-page file advance that applies if all writes succeed
+ const size_t file_advance =
+ TruncateToPageBoundary(alignment, buf_.CurrentSize());
+
+ // Calculate the leftover tail. We write it here padded with zeros, BUT we
+ // will write it again in the future, either on Close() OR when the current
+ // whole page fills out.
+ const size_t leftover_tail = buf_.CurrentSize() - file_advance;
+
+ // Round up and pad
+ buf_.PadToAlignmentWith(0);
+
+ const char* src = buf_.BufferStart();
+ uint64_t write_offset = next_write_offset_;
+ size_t left = buf_.CurrentSize();
+ DataVerificationInfo v_info;
+ char checksum_buf[sizeof(uint32_t)];
+ Env::IOPriority rate_limiter_priority_used =
+ WritableFileWriter::DecideRateLimiterPriority(
+ writable_file_->GetIOPriority(), op_rate_limiter_priority);
+ IOOptions io_options;
+ io_options.rate_limiter_priority = rate_limiter_priority_used;
+
+ while (left > 0) {
+ // Check how much is allowed
+ size_t size = left;
+ if (rate_limiter_ != nullptr &&
+ rate_limiter_priority_used != Env::IO_TOTAL) {
+ size = rate_limiter_->RequestToken(left, buf_.Alignment(),
+ rate_limiter_priority_used, stats_,
+ RateLimiter::OpType::kWrite);
+ }
+
+ {
+ IOSTATS_TIMER_GUARD(write_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+ // direct writes must be positional
+ if (perform_data_verification_) {
+ Crc32cHandoffChecksumCalculation(src, size, checksum_buf);
+ v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
+ s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
+ io_options, v_info, nullptr);
+ } else {
+ s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
+ io_options, nullptr);
+ }
+
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::steady_clock::now();
+ NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
+ if (!s.ok()) {
+ NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(),
+ size, write_offset);
+ }
+ }
+ if (!s.ok()) {
+ buf_.Size(file_advance + leftover_tail);
+ set_seen_error();
+ return s;
+ }
+ }
+
+ IOSTATS_ADD(bytes_written, size);
+ left -= size;
+ src += size;
+ write_offset += size;
+ uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+ flushed_size_.store(cur_size + size, std::memory_order_release);
+ assert((next_write_offset_ % alignment) == 0);
+ }
+
+ if (s.ok()) {
+ // Move the tail to the beginning of the buffer
+ // This never happens during a normal Append, but rather during an explicit
+ // call to Flush()/Sync() or Close()
+ buf_.RefitTail(file_advance, leftover_tail);
+ // This is where we start writing next time, which may or may not be the
+ // actual file size on disk. They match if the buffer size is a multiple of
+ // whole pages; otherwise filesize_ is leftover_tail behind
+ next_write_offset_ += file_advance;
+ } else {
+ set_seen_error();
+ }
+ return s;
+}
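+
+// [Editorial worked example, not part of the original file] With a 4096-byte
+// alignment and buf_.CurrentSize() == 10000:
+//   file_advance  = TruncateToPageBoundary(4096, 10000) = 8192
+//   leftover_tail = 10000 - 8192 = 1808
+// PadToAlignmentWith(0) grows the buffer to 12288 bytes, all of which is
+// written. On success, RefitTail(8192, 1808) moves the last 1808 data bytes to
+// the front of the buffer and next_write_offset_ advances by 8192, so the tail
+// (re-padded) is written again at an aligned offset on the next flush.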
+
+IOStatus WritableFileWriter::WriteDirectWithChecksum(
+ Env::IOPriority op_rate_limiter_priority) {
+ if (seen_error()) {
+ return AssertFalseAndGetStatusForPrevError();
+ }
+
+ assert(use_direct_io());
+ assert(perform_data_verification_ && buffered_data_with_checksum_);
+ IOStatus s;
+ const size_t alignment = buf_.Alignment();
+ assert((next_write_offset_ % alignment) == 0);
+
+ // Calculate the whole-page file advance that applies if all writes succeed
+ const size_t file_advance =
+ TruncateToPageBoundary(alignment, buf_.CurrentSize());
+
+ // Calculate the leftover tail. We write it here padded with zeros, BUT we
+ // will write it again in the future, either on Close() OR when the current
+ // whole page fills out.
+ const size_t leftover_tail = buf_.CurrentSize() - file_advance;
+
+ // Round up, pad, and combine the checksum.
+ size_t last_cur_size = buf_.CurrentSize();
+ buf_.PadToAlignmentWith(0);
+ size_t padded_size = buf_.CurrentSize() - last_cur_size;
+ const char* padded_start = buf_.BufferStart() + last_cur_size;
+ uint32_t padded_checksum = crc32c::Value(padded_start, padded_size);
+ buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine(
+ buffered_data_crc32c_checksum_, padded_checksum, padded_size);
+
+ const char* src = buf_.BufferStart();
+ uint64_t write_offset = next_write_offset_;
+ size_t left = buf_.CurrentSize();
+ DataVerificationInfo v_info;
+ char checksum_buf[sizeof(uint32_t)];
+
+ Env::IOPriority rate_limiter_priority_used =
+ WritableFileWriter::DecideRateLimiterPriority(
+ writable_file_->GetIOPriority(), op_rate_limiter_priority);
+ IOOptions io_options;
+ io_options.rate_limiter_priority = rate_limiter_priority_used;
+ // Check how much is allowed. Here, we loop until the rate limiter allows us
+ // to write the entire buffer.
+ // TODO: this needs to be improved since it somewhat defeats the purpose of
+ // the rate limiter
+ size_t data_size = left;
+ if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
+ while (data_size > 0) {
+ size_t size;
+ size = rate_limiter_->RequestToken(data_size, buf_.Alignment(),
+ rate_limiter_priority_used, stats_,
+ RateLimiter::OpType::kWrite);
+ data_size -= size;
+ }
+ }
+
+ {
+ IOSTATS_TIMER_GUARD(write_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+ FileOperationInfo::StartTimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = FileOperationInfo::StartNow();
+ }
+ // direct writes must be positional
+ EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_);
+ v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
+ s = writable_file_->PositionedAppend(Slice(src, left), write_offset,
+ io_options, v_info, nullptr);
+
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::steady_clock::now();
+ NotifyOnFileWriteFinish(write_offset, left, start_ts, finish_ts, s);
+ if (!s.ok()) {
+ NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(),
+ left, write_offset);
+ }
+ }
+ if (!s.ok()) {
+ // In this case, shrink the buffer back to the unpadded data and recompute
+ // buffered_data_crc32c_checksum_ so that it aligns with the data remaining
+ // in the buffer.
+ buf_.Size(file_advance + leftover_tail);
+ buffered_data_crc32c_checksum_ =
+ crc32c::Value(buf_.BufferStart(), buf_.CurrentSize());
+ set_seen_error();
+ return s;
+ }
+ }
+
+ IOSTATS_ADD(bytes_written, left);
+ assert((next_write_offset_ % alignment) == 0);
+ uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+ flushed_size_.store(cur_size + left, std::memory_order_release);
+
+ if (s.ok()) {
+ // Move the tail to the beginning of the buffer
+ // This never happens during a normal Append, but rather during an explicit
+ // call to Flush()/Sync() or Close(). The buffer checksum will also be
+ // recalculated accordingly.
+ buf_.RefitTail(file_advance, leftover_tail);
+ // Adjust the checksum value to align with the data in the buffer
+ buffered_data_crc32c_checksum_ =
+ crc32c::Value(buf_.BufferStart(), buf_.CurrentSize());
+ // This is where we start writing next time, which may or may not be the
+ // actual file size on disk. They match if the buffer size is a multiple of
+ // whole pages; otherwise filesize_ is leftover_tail behind
+ next_write_offset_ += file_advance;
+ } else {
+ set_seen_error();
+ }
+ return s;
+}
+#endif // !ROCKSDB_LITE
+Env::IOPriority WritableFileWriter::DecideRateLimiterPriority(
+ Env::IOPriority writable_file_io_priority,
+ Env::IOPriority op_rate_limiter_priority) {
+ if (writable_file_io_priority == Env::IO_TOTAL &&
+ op_rate_limiter_priority == Env::IO_TOTAL) {
+ return Env::IO_TOTAL;
+ } else if (writable_file_io_priority == Env::IO_TOTAL) {
+ return op_rate_limiter_priority;
+ } else if (op_rate_limiter_priority == Env::IO_TOTAL) {
+ return writable_file_io_priority;
+ } else {
+ return op_rate_limiter_priority;
+ }
+}
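+
+// [Editorial note, not part of the original file] In effect, the op-level
+// priority wins whenever both priorities are set; Env::IO_TOTAL means "unset".
+// For example:
+//   DecideRateLimiterPriority(Env::IO_HIGH, Env::IO_TOTAL)  == Env::IO_HIGH
+//   DecideRateLimiterPriority(Env::IO_HIGH, Env::IO_LOW)    == Env::IO_LOW
+//   DecideRateLimiterPriority(Env::IO_TOTAL, Env::IO_TOTAL) == Env::IO_TOTAL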
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/writable_file_writer.h b/src/rocksdb/file/writable_file_writer.h
new file mode 100644
index 000000000..b3985eb20
--- /dev/null
+++ b/src/rocksdb/file/writable_file_writer.h
@@ -0,0 +1,336 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+
+#include "db/version_edit.h"
+#include "env/file_system_tracer.h"
+#include "port/port.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class SystemClock;
+
+// WritableFileWriter is a wrapper on top of Env::WritableFile. It provides
+// facilities to:
+// - Handle Buffered and Direct writes.
+// - Rate limit writes.
+// - Flush and Sync the data to the underlying filesystem.
+// - Notify any interested listeners on the completion of a write.
+// - Update IO stats.
+class WritableFileWriter {
+ private:
+#ifndef ROCKSDB_LITE
+ void NotifyOnFileWriteFinish(
+ uint64_t offset, size_t length,
+ const FileOperationInfo::StartTimePoint& start_ts,
+ const FileOperationInfo::FinishTimePoint& finish_ts,
+ const IOStatus& io_status) {
+ FileOperationInfo info(FileOperationType::kWrite, file_name_, start_ts,
+ finish_ts, io_status, temperature_);
+ info.offset = offset;
+ info.length = length;
+
+ for (auto& listener : listeners_) {
+ listener->OnFileWriteFinish(info);
+ }
+ info.status.PermitUncheckedError();
+ }
+ void NotifyOnFileFlushFinish(
+ const FileOperationInfo::StartTimePoint& start_ts,
+ const FileOperationInfo::FinishTimePoint& finish_ts,
+ const IOStatus& io_status) {
+ FileOperationInfo info(FileOperationType::kFlush, file_name_, start_ts,
+ finish_ts, io_status, temperature_);
+
+ for (auto& listener : listeners_) {
+ listener->OnFileFlushFinish(info);
+ }
+ info.status.PermitUncheckedError();
+ }
+ void NotifyOnFileSyncFinish(
+ const FileOperationInfo::StartTimePoint& start_ts,
+ const FileOperationInfo::FinishTimePoint& finish_ts,
+ const IOStatus& io_status,
+ FileOperationType type = FileOperationType::kSync) {
+ FileOperationInfo info(type, file_name_, start_ts, finish_ts, io_status,
+ temperature_);
+
+ for (auto& listener : listeners_) {
+ listener->OnFileSyncFinish(info);
+ }
+ info.status.PermitUncheckedError();
+ }
+ void NotifyOnFileRangeSyncFinish(
+ uint64_t offset, size_t length,
+ const FileOperationInfo::StartTimePoint& start_ts,
+ const FileOperationInfo::FinishTimePoint& finish_ts,
+ const IOStatus& io_status) {
+ FileOperationInfo info(FileOperationType::kRangeSync, file_name_, start_ts,
+ finish_ts, io_status, temperature_);
+ info.offset = offset;
+ info.length = length;
+
+ for (auto& listener : listeners_) {
+ listener->OnFileRangeSyncFinish(info);
+ }
+ info.status.PermitUncheckedError();
+ }
+ void NotifyOnFileTruncateFinish(
+ const FileOperationInfo::StartTimePoint& start_ts,
+ const FileOperationInfo::FinishTimePoint& finish_ts,
+ const IOStatus& io_status) {
+ FileOperationInfo info(FileOperationType::kTruncate, file_name_, start_ts,
+ finish_ts, io_status, temperature_);
+
+ for (auto& listener : listeners_) {
+ listener->OnFileTruncateFinish(info);
+ }
+ info.status.PermitUncheckedError();
+ }
+ void NotifyOnFileCloseFinish(
+ const FileOperationInfo::StartTimePoint& start_ts,
+ const FileOperationInfo::FinishTimePoint& finish_ts,
+ const IOStatus& io_status) {
+ FileOperationInfo info(FileOperationType::kClose, file_name_, start_ts,
+ finish_ts, io_status, temperature_);
+
+ for (auto& listener : listeners_) {
+ listener->OnFileCloseFinish(info);
+ }
+ info.status.PermitUncheckedError();
+ }
+
+ void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation,
+ const std::string& file_path, size_t length = 0,
+ uint64_t offset = 0) {
+ if (listeners_.empty()) {
+ return;
+ }
+ IOErrorInfo io_error_info(io_status, operation, file_path, length, offset);
+ for (auto& listener : listeners_) {
+ listener->OnIOError(io_error_info);
+ }
+ io_error_info.io_status.PermitUncheckedError();
+ }
+#endif // ROCKSDB_LITE
+
+ bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+ void UpdateFileChecksum(const Slice& data);
+ void Crc32cHandoffChecksumCalculation(const char* data, size_t size,
+ char* buf);
+
+ std::string file_name_;
+ FSWritableFilePtr writable_file_;
+ SystemClock* clock_;
+ AlignedBuffer buf_;
+ size_t max_buffer_size_;
+ // The actually written data size (not counting padding); can be used for
+ // truncate
+ std::atomic<uint64_t> filesize_;
+ std::atomic<uint64_t> flushed_size_;
+#ifndef ROCKSDB_LITE
+ // This is necessary when we use unbuffered access and writes must happen at
+ // aligned offsets, so we need to go back and write that page again
+ uint64_t next_write_offset_;
+#endif // ROCKSDB_LITE
+ bool pending_sync_;
+ std::atomic<bool> seen_error_;
+#ifndef NDEBUG
+ // SyncWithoutFlush() is the only function allowed to be called concurrently
+ // with other functions. One of the concurrent calls could set seen_error_,
+ // and the other one would hit an assertion in debug mode.
+ std::atomic<bool> sync_without_flush_called_ = false;
+#endif // NDEBUG
+ uint64_t last_sync_size_;
+ uint64_t bytes_per_sync_;
+ RateLimiter* rate_limiter_;
+ Statistics* stats_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ std::unique_ptr<FileChecksumGenerator> checksum_generator_;
+ bool checksum_finalized_;
+ bool perform_data_verification_;
+ uint32_t buffered_data_crc32c_checksum_;
+ bool buffered_data_with_checksum_;
+#ifndef ROCKSDB_LITE
+ Temperature temperature_;
+#endif // ROCKSDB_LITE
+
+ public:
+ WritableFileWriter(
+ std::unique_ptr<FSWritableFile>&& file, const std::string& _file_name,
+ const FileOptions& options, SystemClock* clock = nullptr,
+ const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+ Statistics* stats = nullptr,
+ const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+ FileChecksumGenFactory* file_checksum_gen_factory = nullptr,
+ bool perform_data_verification = false,
+ bool buffered_data_with_checksum = false)
+ : file_name_(_file_name),
+ writable_file_(std::move(file), io_tracer, _file_name),
+ clock_(clock),
+ buf_(),
+ max_buffer_size_(options.writable_file_max_buffer_size),
+ filesize_(0),
+ flushed_size_(0),
+#ifndef ROCKSDB_LITE
+ next_write_offset_(0),
+#endif // ROCKSDB_LITE
+ pending_sync_(false),
+ seen_error_(false),
+ last_sync_size_(0),
+ bytes_per_sync_(options.bytes_per_sync),
+ rate_limiter_(options.rate_limiter),
+ stats_(stats),
+ listeners_(),
+ checksum_generator_(nullptr),
+ checksum_finalized_(false),
+ perform_data_verification_(perform_data_verification),
+ buffered_data_crc32c_checksum_(0),
+ buffered_data_with_checksum_(buffered_data_with_checksum) {
+#ifndef ROCKSDB_LITE
+ temperature_ = options.temperature;
+#endif // ROCKSDB_LITE
+ assert(!use_direct_io() || max_buffer_size_ > 0);
+ TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0",
+ reinterpret_cast<void*>(max_buffer_size_));
+ buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
+ buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_));
+#ifndef ROCKSDB_LITE
+ std::for_each(listeners.begin(), listeners.end(),
+ [this](const std::shared_ptr<EventListener>& e) {
+ if (e->ShouldBeNotifiedOnFileIO()) {
+ listeners_.emplace_back(e);
+ }
+ });
+#else // !ROCKSDB_LITE
+ (void)listeners;
+#endif
+ if (file_checksum_gen_factory != nullptr) {
+ FileChecksumGenContext checksum_gen_context;
+ checksum_gen_context.file_name = _file_name;
+ checksum_generator_ =
+ file_checksum_gen_factory->CreateFileChecksumGenerator(
+ checksum_gen_context);
+ }
+ }
+
+ static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<WritableFileWriter>* writer,
+ IODebugContext* dbg);
+ WritableFileWriter(const WritableFileWriter&) = delete;
+
+ WritableFileWriter& operator=(const WritableFileWriter&) = delete;
+
+ ~WritableFileWriter() {
+ auto s = Close();
+ s.PermitUncheckedError();
+ }
+
+ std::string file_name() const { return file_name_; }
+
+ // When this Append API is called, if the crc32c_checksum is not provided, we
+ // will calculate the checksum internally.
+ IOStatus Append(const Slice& data, uint32_t crc32c_checksum = 0,
+ Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL);
+
+ IOStatus Pad(const size_t pad_bytes,
+ Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL);
+
+ IOStatus Flush(Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL);
+
+ IOStatus Close();
+
+ IOStatus Sync(bool use_fsync);
+
+ // Sync only the data that was already Flush()ed. Safe to call concurrently
+ // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(),
+ // returns NotSupported status.
+ IOStatus SyncWithoutFlush(bool use_fsync);
+
+ uint64_t GetFileSize() const {
+ return filesize_.load(std::memory_order_acquire);
+ }
+
+ // Returns the size of data flushed to the underlying `FSWritableFile`.
+ // Expected to match `writable_file()->GetFileSize()`.
+ // The return value can serve as a lower-bound for the amount of data synced
+ // by a future call to `SyncWithoutFlush()`.
+ uint64_t GetFlushedSize() const {
+ return flushed_size_.load(std::memory_order_acquire);
+ }
+
+ IOStatus InvalidateCache(size_t offset, size_t length) {
+ return writable_file_->InvalidateCache(offset, length);
+ }
+
+ FSWritableFile* writable_file() const { return writable_file_.get(); }
+
+ bool use_direct_io() { return writable_file_->use_direct_io(); }
+
+ bool BufferIsEmpty() { return buf_.CurrentSize() == 0; }
+
+ void TEST_SetFileChecksumGenerator(
+ FileChecksumGenerator* checksum_generator) {
+ checksum_generator_.reset(checksum_generator);
+ }
+
+ std::string GetFileChecksum();
+
+ const char* GetFileChecksumFuncName() const;
+
+ bool seen_error() const {
+ return seen_error_.load(std::memory_order_relaxed);
+ }
+ // For options with relaxed consistency guarantees, users may want to
+ // continue operating on the file after an error occurs.
+ void reset_seen_error() {
+ seen_error_.store(false, std::memory_order_relaxed);
+ }
+ void set_seen_error() { seen_error_.store(true, std::memory_order_relaxed); }
+
+ IOStatus AssertFalseAndGetStatusForPrevError() {
+ // This should only happen if SyncWithoutFlush() was called.
+ assert(sync_without_flush_called_);
+ return IOStatus::IOError("Writer has previous error.");
+ }
+
+ private:
+ // Decide the Rate Limiter priority.
+ static Env::IOPriority DecideRateLimiterPriority(
+ Env::IOPriority writable_file_io_priority,
+ Env::IOPriority op_rate_limiter_priority);
+
+ // Used when OS buffering is OFF and we are writing via DMA,
+ // such as in Direct I/O mode
+#ifndef ROCKSDB_LITE
+ IOStatus WriteDirect(Env::IOPriority op_rate_limiter_priority);
+ IOStatus WriteDirectWithChecksum(Env::IOPriority op_rate_limiter_priority);
+#endif // !ROCKSDB_LITE
+ // Normal write.
+ IOStatus WriteBuffered(const char* data, size_t size,
+ Env::IOPriority op_rate_limiter_priority);
+ IOStatus WriteBufferedWithChecksum(const char* data, size_t size,
+ Env::IOPriority op_rate_limiter_priority);
+ IOStatus RangeSync(uint64_t offset, uint64_t nbytes);
+ IOStatus SyncInternal(bool use_fsync);
+};
+} // namespace ROCKSDB_NAMESPACE
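+
+// [Editorial sketch, not part of the original file] A typical write path
+// through WritableFileWriter, assuming the default FileSystem; the path name
+// is arbitrary and error handling is abbreviated:
+//
+//   std::unique_ptr<WritableFileWriter> writer;
+//   IOStatus io_s = WritableFileWriter::Create(
+//       FileSystem::Default(), "/tmp/example.sst", FileOptions(), &writer,
+//       nullptr /* dbg */);
+//   if (io_s.ok()) {
+//     io_s = writer->Append(Slice("some data"));
+//     if (io_s.ok()) io_s = writer->Flush();
+//     if (io_s.ok()) io_s = writer->Sync(false /* use_fsync */);
+//     io_s = writer->Close();
+//   }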