Adding upstream version 16.2.11+ds.upstream/16.2.11+ds upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/rocksdb/file
parent: Initial commit. (diff)
download: ceph-upstream/16.2.11+ds.tar.xz
ceph-upstream/16.2.11+ds.zip
21 files changed, 4480 insertions, 0 deletions
diff --git a/src/rocksdb/file/delete_scheduler.cc b/src/rocksdb/file/delete_scheduler.cc
new file mode 100644
index 000000000..bb318e595
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler.cc
@@ -0,0 +1,357 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "file/delete_scheduler.h"
+
+#include <thread>
+#include <vector>
+
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DeleteScheduler::DeleteScheduler(Env* env, FileSystem* fs,
+                                 int64_t rate_bytes_per_sec, Logger* info_log,
+                                 SstFileManagerImpl* sst_file_manager,
+                                 double max_trash_db_ratio,
+                                 uint64_t bytes_max_delete_chunk)
+    : env_(env),
+      fs_(fs),
+      total_trash_size_(0),
+      rate_bytes_per_sec_(rate_bytes_per_sec),
+      pending_files_(0),
+      bytes_max_delete_chunk_(bytes_max_delete_chunk),
+      closing_(false),
+      cv_(&mu_),
+      info_log_(info_log),
+      sst_file_manager_(sst_file_manager),
+      max_trash_db_ratio_(max_trash_db_ratio) {
+  assert(sst_file_manager != nullptr);
+  assert(max_trash_db_ratio >= 0);
+  bg_thread_.reset(
+      new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this));
+}
+
+DeleteScheduler::~DeleteScheduler() {
+  {
+    InstrumentedMutexLock l(&mu_);
+    closing_ = true;
+    cv_.SignalAll();
+  }
+  if (bg_thread_) {
+    bg_thread_->join();
+  }
+}
+
+Status DeleteScheduler::DeleteFile(const std::string& file_path,
+                                   const std::string& dir_to_sync,
+                                   const bool force_bg) {
+  Status s;
+  if (rate_bytes_per_sec_.load() <= 0 || (!force_bg &&
+      total_trash_size_.load() >
+          sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) {
+    // Rate limiting is disabled or trash size makes up more than
+    // max_trash_db_ratio_ (default 25%) of the total DB size
+    TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
+    s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+    if (s.ok()) {
+      sst_file_manager_->OnDeleteFile(file_path);
+    }
+    return s;
+  }
+
+  // Move file to trash
+  std::string trash_file;
+  s = MarkAsTrash(file_path, &trash_file);
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s",
+                    file_path.c_str(), s.ToString().c_str());
+    s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+    if (s.ok()) {
+      sst_file_manager_->OnDeleteFile(file_path);
+    }
+    return s;
+  }
+
+  // Update the total trash size
+  uint64_t trash_file_size = 0;
+  fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
+  total_trash_size_.fetch_add(trash_file_size);
+
+  // Add file to delete queue
+  {
+    InstrumentedMutexLock l(&mu_);
+    queue_.emplace(trash_file, dir_to_sync);
+    pending_files_++;
+    if (pending_files_ == 1) {
+      cv_.SignalAll();
+    }
+  }
+  return s;
+}
+
+std::map<std::string, Status> DeleteScheduler::GetBackgroundErrors() {
+  InstrumentedMutexLock l(&mu_);
+  return bg_errors_;
+}
+
+const std::string DeleteScheduler::kTrashExtension = ".trash";
+bool DeleteScheduler::IsTrashFile(const std::string& file_path) {
+  return (file_path.size() >= kTrashExtension.size() &&
+          file_path.rfind(kTrashExtension) ==
+              file_path.size() - kTrashExtension.size());
+}
+
+Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+                                         const std::string& path) {
+  Status s;
+  // Check if there are any files marked as trash in this path
+  std::vector<std::string> files_in_path;
+  s = env->GetChildren(path, &files_in_path);
+  if (!s.ok()) {
+    return s;
+  }
+  for (const std::string& current_file : files_in_path) {
+    if (!DeleteScheduler::IsTrashFile(current_file)) {
+      // not a trash file, skip
+      continue;
+    }
+
+    Status file_delete;
+    std::string trash_file = path + "/" + current_file;
+    if (sfm) {
+      // We have an SstFileManager that will schedule the file delete
+      sfm->OnAddFile(trash_file);
+      file_delete = sfm->ScheduleFileDeletion(trash_file, path);
+    } else {
+      // Delete the file immediately
+      file_delete = env->DeleteFile(trash_file);
+    }
+
+    if (s.ok() && !file_delete.ok()) {
+      s = file_delete;
+    }
+  }
+
+  return s;
+}
+
+Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
+                                    std::string* trash_file) {
+  // Sanity check of the path
+  size_t idx = file_path.rfind("/");
+  if (idx == std::string::npos || idx == file_path.size() - 1) {
+    return Status::InvalidArgument("file_path is corrupted");
+  }
+
+  Status s;
+  if (DeleteScheduler::IsTrashFile(file_path)) {
+    // This is already a trash file
+    *trash_file = file_path;
+    return s;
+  }
+
+  *trash_file = file_path + kTrashExtension;
+  // TODO(tec) : Implement Env::RenameFileIfNotExist and remove
+  //             file_move_mu mutex.
+  int cnt = 0;
+  InstrumentedMutexLock l(&file_move_mu_);
+  while (true) {
+    s = fs_->FileExists(*trash_file, IOOptions(), nullptr);
+    if (s.IsNotFound()) {
+      // We found a path for our file in trash
+      s = fs_->RenameFile(file_path, *trash_file, IOOptions(), nullptr);
+      break;
+    } else if (s.ok()) {
+      // Name conflict, generate new random suffix
+      *trash_file = file_path + std::to_string(cnt) + kTrashExtension;
+    } else {
+      // Error during FileExists call, we cannot continue
+      break;
+    }
+    cnt++;
+  }
+  if (s.ok()) {
+    sst_file_manager_->OnMoveFile(file_path, *trash_file);
+  }
+  return s;
+}
+
+void DeleteScheduler::BackgroundEmptyTrash() {
+  TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
+
+  while (true) {
+    InstrumentedMutexLock l(&mu_);
+    while (queue_.empty() && !closing_) {
+      cv_.Wait();
+    }
+
+    if (closing_) {
+      return;
+    }
+
+    // Delete all files in queue_
+    uint64_t start_time = env_->NowMicros();
+    uint64_t total_deleted_bytes = 0;
+    int64_t current_delete_rate = rate_bytes_per_sec_.load();
+    while (!queue_.empty() && !closing_) {
+      if (current_delete_rate != rate_bytes_per_sec_.load()) {
+        // User changed the delete rate
+        current_delete_rate = rate_bytes_per_sec_.load();
+        start_time = env_->NowMicros();
+        total_deleted_bytes = 0;
+      }
+
+      // Get new file to delete
+      const FileAndDir& fad = queue_.front();
+      std::string path_in_trash = fad.fname;
+
+      // We dont need to hold the lock while deleting the file
+      mu_.Unlock();
+      uint64_t deleted_bytes = 0;
+      bool is_complete = true;
+      // Delete file from trash and update total_penlty value
+      Status s =
+          DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
+      total_deleted_bytes += deleted_bytes;
+      mu_.Lock();
+      if (is_complete) {
+        queue_.pop();
+      }
+
+      if (!s.ok()) {
+        bg_errors_[path_in_trash] = s;
+      }
+
+      // Apply penlty if necessary
+      uint64_t total_penlty;
+      if (current_delete_rate > 0) {
+        // rate limiting is enabled
+        total_penlty =
+            ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate);
+        while (!closing_ && !cv_.TimedWait(start_time + total_penlty)) {}
+      } else {
+        // rate limiting is disabled
+        total_penlty = 0;
+      }
+      TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
+                               &total_penlty);
+
+      if (is_complete) {
+        pending_files_--;
+      }
+      if (pending_files_ == 0) {
+        // Unblock WaitForEmptyTrash since there are no more files waiting
+        // to be deleted
+        cv_.SignalAll();
+      }
+    }
+  }
+}
+
+Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
+                                        const std::string& dir_to_sync,
+                                        uint64_t* deleted_bytes,
+                                        bool* is_complete) {
+  uint64_t file_size;
+  Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
+  *is_complete = true;
+  TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
+  if (s.ok()) {
+    bool need_full_delete = true;
+    if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
+      uint64_t num_hard_links = 2;
+      // We don't have to worry aobut data race between linking a new
+      // file after the number of file link check and ftruncte because
+      // the file is now in trash and no hardlink is supposed to create
+      // to trash files by RocksDB.
+      Status my_status = fs_->NumFileLinks(path_in_trash, IOOptions(),
+                                           &num_hard_links, nullptr);
+      if (my_status.ok()) {
+        if (num_hard_links == 1) {
+          std::unique_ptr<FSWritableFile> wf;
+          my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(),
+                                              &wf, nullptr);
+          if (my_status.ok()) {
+            my_status = wf->Truncate(file_size - bytes_max_delete_chunk_,
+                                     IOOptions(), nullptr);
+            if (my_status.ok()) {
+              TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
+              my_status = wf->Fsync(IOOptions(), nullptr);
+            }
+          }
+          if (my_status.ok()) {
+            *deleted_bytes = bytes_max_delete_chunk_;
+            need_full_delete = false;
+            *is_complete = false;
+          } else {
+            ROCKS_LOG_WARN(info_log_,
+                           "Failed to partially delete %s from trash -- %s",
+                           path_in_trash.c_str(), my_status.ToString().c_str());
+          }
+        } else {
+          ROCKS_LOG_INFO(info_log_,
+                         "Cannot delete %s slowly through ftruncate from trash "
+                         "as it has other links",
+                         path_in_trash.c_str());
+        }
+      } else if (!num_link_error_printed_) {
+        ROCKS_LOG_INFO(
+            info_log_,
+            "Cannot delete files slowly through ftruncate from trash "
+            "as Env::NumFileLinks() returns error: %s",
+            my_status.ToString().c_str());
+        num_link_error_printed_ = true;
+      }
+    }
+
+    if (need_full_delete) {
+      s = fs_->DeleteFile(path_in_trash, IOOptions(), nullptr);
+      if (!dir_to_sync.empty()) {
+        std::unique_ptr<FSDirectory> dir_obj;
+        if (s.ok()) {
+          s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr);
+        }
+        if (s.ok()) {
+          s = dir_obj->Fsync(IOOptions(), nullptr);
+          TEST_SYNC_POINT_CALLBACK(
+              "DeleteScheduler::DeleteTrashFile::AfterSyncDir",
+              reinterpret_cast<void*>(const_cast<std::string*>(&dir_to_sync)));
+        }
+      }
+      *deleted_bytes = file_size;
+      sst_file_manager_->OnDeleteFile(path_in_trash);
+    }
+  }
+  if (!s.ok()) {
+    // Error while getting file size or while deleting
+    ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s",
+                    path_in_trash.c_str(), s.ToString().c_str());
+    *deleted_bytes = 0;
+  } else {
+    total_trash_size_.fetch_sub(*deleted_bytes);
+  }
+
+  return s;
+}
+
+void DeleteScheduler::WaitForEmptyTrash() {
+  InstrumentedMutexLock l(&mu_);
+  while (pending_files_ > 0 && !closing_) {
+    cv_.Wait();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/delete_scheduler.h b/src/rocksdb/file/delete_scheduler.h
new file mode 100644
index 000000000..60948480a
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler.h
@@ -0,0 +1,141 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <queue>
+#include <string>
+#include <thread>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Logger;
+class SstFileManagerImpl;
+
+// DeleteScheduler allows the DB to enforce a rate limit on file deletion,
+// Instead of deleteing files immediately, files are marked as trash
+// and deleted in a background thread that apply sleep penlty between deletes
+// if they are happening in a rate faster than rate_bytes_per_sec,
+//
+// Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this
+// case DeleteScheduler will delete files immediately.
+class DeleteScheduler {
+ public:
+  DeleteScheduler(Env* env, FileSystem* fs, int64_t rate_bytes_per_sec,
+                  Logger* info_log, SstFileManagerImpl* sst_file_manager,
+                  double max_trash_db_ratio, uint64_t bytes_max_delete_chunk);
+
+  ~DeleteScheduler();
+
+  // Return delete rate limit in bytes per second
+  int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_.load(); }
+
+  // Set delete rate limit in bytes per second
+  void SetRateBytesPerSecond(int64_t bytes_per_sec) {
+    rate_bytes_per_sec_.store(bytes_per_sec);
+  }
+
+  // Mark file as trash directory and schedule it's deletion. If force_bg is
+  // set, it forces the file to always be deleted in the background thread,
+  // except when rate limiting is disabled
+  Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
+      const bool force_bg = false);
+
+  // Wait for all files being deleteing in the background to finish or for
+  // destructor to be called.
+  void WaitForEmptyTrash();
+
+  // Return a map containing errors that happened in BackgroundEmptyTrash
+  // file_path => error status
+  std::map<std::string, Status> GetBackgroundErrors();
+
+  uint64_t GetTotalTrashSize() { return total_trash_size_.load(); }
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  double GetMaxTrashDBRatio() {
+    return max_trash_db_ratio_.load();
+  }
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  void SetMaxTrashDBRatio(double r) {
+    assert(r >= 0);
+    max_trash_db_ratio_.store(r);
+  }
+
+  static const std::string kTrashExtension;
+  static bool IsTrashFile(const std::string& file_path);
+
+  // Check if there are any .trash filse in path, and schedule their deletion
+  // Or delete immediately if sst_file_manager is nullptr
+  static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+                                 const std::string& path);
+
+ private:
+  Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
+
+  Status DeleteTrashFile(const std::string& path_in_trash,
+                         const std::string& dir_to_sync,
+                         uint64_t* deleted_bytes, bool* is_complete);
+
+  void BackgroundEmptyTrash();
+
+  Env* env_;
+  FileSystem* fs_;
+
+  // total size of trash files
+  std::atomic<uint64_t> total_trash_size_;
+  // Maximum number of bytes that should be deleted per second
+  std::atomic<int64_t> rate_bytes_per_sec_;
+  // Mutex to protect queue_, pending_files_, bg_errors_, closing_
+  InstrumentedMutex mu_;
+
+  struct FileAndDir {
+    FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
+    std::string fname;
+    std::string dir;  // empty will be skipped.
+  };
+
+  // Queue of trash files that need to be deleted
+  std::queue<FileAndDir> queue_;
+  // Number of trash files that are waiting to be deleted
+  int32_t pending_files_;
+  uint64_t bytes_max_delete_chunk_;
+  // Errors that happened in BackgroundEmptyTrash (file_path => error)
+  std::map<std::string, Status> bg_errors_;
+
+  bool num_link_error_printed_ = false;
+  // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
+  bool closing_;
+  // Condition variable signaled in these conditions
+  //    - pending_files_ value change from 0 => 1
+  //    - pending_files_ value change from 1 => 0
+  //    - closing_ value is set to true
+  InstrumentedCondVar cv_;
+  // Background thread running BackgroundEmptyTrash
+  std::unique_ptr<port::Thread> bg_thread_;
+  // Mutex to protect threads from file name conflicts
+  InstrumentedMutex file_move_mu_;
+  Logger* info_log_;
+  SstFileManagerImpl* sst_file_manager_;
+  // If the trash size constitutes for more than this fraction of the total DB
+  // size we will start deleting new files passed to DeleteScheduler
+  // immediately
+  std::atomic<double> max_trash_db_ratio_;
+  static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/delete_scheduler_test.cc b/src/rocksdb/file/delete_scheduler_test.cc
new file mode 100644
index 000000000..cff645de5
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler_test.cc
@@ -0,0 +1,693 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <cinttypes>
+#include <thread>
+#include <vector>
+
+#include "file/delete_scheduler.h"
+#include "file/sst_file_manager_impl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteSchedulerTest : public testing::Test {
+ public:
+  DeleteSchedulerTest() : env_(Env::Default()) {
+    const int kNumDataDirs = 3;
+    dummy_files_dirs_.reserve(kNumDataDirs);
+    for (size_t i = 0; i < kNumDataDirs; ++i) {
+      dummy_files_dirs_.emplace_back(
+          test::PerThreadDBPath(env_, "delete_scheduler_dummy_data_dir") +
+          ToString(i));
+      DestroyAndCreateDir(dummy_files_dirs_.back());
+    }
+  }
+
+  ~DeleteSchedulerTest() override {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    for (const auto& dummy_files_dir : dummy_files_dirs_) {
+      test::DestroyDir(env_, dummy_files_dir);
+    }
+  }
+
+  void DestroyAndCreateDir(const std::string& dir) {
+    ASSERT_OK(test::DestroyDir(env_, dir));
+    EXPECT_OK(env_->CreateDir(dir));
+  }
+
+  int CountNormalFiles(size_t dummy_files_dirs_idx = 0) {
+    std::vector<std::string> files_in_dir;
+    EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+                                &files_in_dir));
+
+    int normal_cnt = 0;
+    for (auto& f : files_in_dir) {
+      if (!DeleteScheduler::IsTrashFile(f) && f != "." && f != "..") {
+        normal_cnt++;
+      }
+    }
+    return normal_cnt;
+  }
+
+  int CountTrashFiles(size_t dummy_files_dirs_idx = 0) {
+    std::vector<std::string> files_in_dir;
+    EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+                                &files_in_dir));
+
+    int trash_cnt = 0;
+    for (auto& f : files_in_dir) {
+      if (DeleteScheduler::IsTrashFile(f)) {
+        trash_cnt++;
+      }
+    }
+    return trash_cnt;
+  }
+
+  std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
+                           size_t dummy_files_dirs_idx = 0) {
+    std::string file_path =
+        dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
+    std::unique_ptr<WritableFile> f;
+    env_->NewWritableFile(file_path, &f, EnvOptions());
+    std::string data(size, 'A');
+    EXPECT_OK(f->Append(data));
+    EXPECT_OK(f->Close());
+    sst_file_mgr_->OnAddFile(file_path, false);
+    return file_path;
+  }
+
+  void NewDeleteScheduler() {
+    // Tests in this file are for DeleteScheduler component and dont create any
+    // DBs, so we need to set max_trash_db_ratio to 100% (instead of default
+    // 25%)
+    std::shared_ptr<FileSystem>
+                fs(std::make_shared<LegacyFileSystemWrapper>(env_));
+    sst_file_mgr_.reset(
+        new SstFileManagerImpl(env_, fs, nullptr, rate_bytes_per_sec_,
+                               /* max_trash_db_ratio= */ 1.1, 128 * 1024));
+    delete_scheduler_ = sst_file_mgr_->delete_scheduler();
+  }
+
+  Env* env_;
+  std::vector<std::string> dummy_files_dirs_;
+  int64_t rate_bytes_per_sec_;
+  DeleteScheduler* delete_scheduler_;
+  std::unique_ptr<SstFileManagerImpl> sst_file_mgr_;
+};
+
+// Test the basic functionality of DeleteScheduler (Rate Limiting).
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in trash
+// 4- Verify that BackgroundEmptyTrash used to correct penlties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::BasicRateLimiting:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+
+  std::vector<uint64_t> penalties;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+  int dir_synced = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile::AfterSyncDir", [&](void* arg) {
+        dir_synced++;
+        std::string* dir = reinterpret_cast<std::string*>(arg);
+        EXPECT_EQ(dummy_files_dirs_[0], *dir);
+      });
+
+  int num_files = 100;  // 100 files
+  uint64_t file_size = 1024;  // every file is 1 kb
+  std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+
+  for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+    penalties.clear();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndCreateDir(dummy_files_dirs_[0]);
+    rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+    NewDeleteScheduler();
+
+    dir_synced = 0;
+    // Create 100 dummy files, every file is 1 Kb
+    std::vector<std::string> generated_files;
+    for (int i = 0; i < num_files; i++) {
+      std::string file_name = "file" + ToString(i) + ".data";
+      generated_files.push_back(NewDummyFile(file_name, file_size));
+    }
+
+    // Delete dummy files and measure time spent to empty trash
+    for (int i = 0; i < num_files; i++) {
+      ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i],
+                                              dummy_files_dirs_[0]));
+    }
+    ASSERT_EQ(CountNormalFiles(), 0);
+
+    uint64_t delete_start_time = env_->NowMicros();
+    TEST_SYNC_POINT("DeleteSchedulerTest::BasicRateLimiting:1");
+    delete_scheduler_->WaitForEmptyTrash();
+    uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+
+    uint64_t total_files_size = 0;
+    uint64_t expected_penlty = 0;
+    ASSERT_EQ(penalties.size(), num_files);
+    for (int i = 0; i < num_files; i++) {
+      total_files_size += file_size;
+      expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+      ASSERT_EQ(expected_penlty, penalties[i]);
+    }
+    ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+
+    ASSERT_EQ(num_files, dir_synced);
+
+    ASSERT_EQ(CountTrashFiles(), 0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DeleteSchedulerTest, MultiDirectoryDeletionsScheduled) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  rate_bytes_per_sec_ = 1 << 20;  // 1MB
+  NewDeleteScheduler();
+
+  // Generate dummy files in multiple directories
+  const size_t kNumFiles = dummy_files_dirs_.size();
+  const size_t kFileSize = 1 << 10;  // 1KB
+  std::vector<std::string> generated_files;
+  for (size_t i = 0; i < kNumFiles; i++) {
+    generated_files.push_back(NewDummyFile("file", kFileSize, i));
+    ASSERT_EQ(1, CountNormalFiles(i));
+  }
+
+  // Mark dummy files as trash
+  for (size_t i = 0; i < kNumFiles; i++) {
+    ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+    ASSERT_EQ(0, CountNormalFiles(i));
+    ASSERT_EQ(1, CountTrashFiles(i));
+  }
+  TEST_SYNC_POINT("DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1");
+  delete_scheduler_->WaitForEmptyTrash();
+
+  // Verify dummy files eventually got deleted
+  for (size_t i = 0; i < kNumFiles; i++) {
+    ASSERT_EQ(0, CountNormalFiles(i));
+    ASSERT_EQ(0, CountTrashFiles(i));
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Same as the BasicRateLimiting test but delete files in multiple threads.
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler using 10 threads
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Verify that BackgroundEmptyTrash used to correct penlties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::RateLimitingMultiThreaded:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+
+  std::vector<uint64_t> penalties;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+  int thread_cnt = 10;
+  int num_files = 10;  // 10 files per thread
+  uint64_t file_size = 1024;  // every file is 1 kb
+
+  std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+  for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+    penalties.clear();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndCreateDir(dummy_files_dirs_[0]);
+    rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+    NewDeleteScheduler();
+
+    // Create 100 dummy files, every file is 1 Kb
+    std::vector<std::string> generated_files;
+    for (int i = 0; i < num_files * thread_cnt; i++) {
+      std::string file_name = "file" + ToString(i) + ".data";
+      generated_files.push_back(NewDummyFile(file_name, file_size));
+    }
+
+    // Delete dummy files using 10 threads and measure time spent to empty trash
+    std::atomic<int> thread_num(0);
+    std::vector<port::Thread> threads;
+    std::function<void()> delete_thread = [&]() {
+      int idx = thread_num.fetch_add(1);
+      int range_start = idx * num_files;
+      int range_end = range_start + num_files;
+      for (int j = range_start; j < range_end; j++) {
+        ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[j], ""));
+      }
+    };
+
+    for (int i = 0; i < thread_cnt; i++) {
+      threads.emplace_back(delete_thread);
+    }
+
+    for (size_t i = 0; i < threads.size(); i++) {
+      threads[i].join();
+    }
+
+    uint64_t delete_start_time = env_->NowMicros();
+    TEST_SYNC_POINT("DeleteSchedulerTest::RateLimitingMultiThreaded:1");
+    delete_scheduler_->WaitForEmptyTrash();
+    uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+
+    uint64_t total_files_size = 0;
+    uint64_t expected_penlty = 0;
+    ASSERT_EQ(penalties.size(), num_files * thread_cnt);
+    for (int i = 0; i < num_files * thread_cnt; i++) {
+      total_files_size += file_size;
+      expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+      ASSERT_EQ(expected_penlty, penalties[i]);
+    }
+    ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+
+    ASSERT_EQ(CountNormalFiles(), 0);
+    ASSERT_EQ(CountTrashFiles(), 0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+// Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure
+// that when DeleteScheduler delete a file it delete it immediately and dont
+// move it to trash
+TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
+  int bg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 0;
+  NewDeleteScheduler();
+
+  for (int i = 0; i < 10; i++) {
+    // Every file we delete will be deleted immediately
+    std::string dummy_file = NewDummyFile("dummy.data");
+    ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+    ASSERT_TRUE(env_->FileExists(dummy_file).IsNotFound());
+    ASSERT_EQ(CountNormalFiles(), 0);
+    ASSERT_EQ(CountTrashFiles(), 0);
+  }
+
+  ASSERT_EQ(bg_delete_file, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Testing that moving files to trash with the same name is not a problem
+// 1- Create 10 files with the same name "conflict.data"
+// 2- Delete the 10 files using DeleteScheduler
+// 3- Make sure that trash directory contain 10 files ("conflict.data" x 10)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that files are deleted from trash
+TEST_F(DeleteSchedulerTest, ConflictNames) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::ConflictNames:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 Mb/sec
+  NewDeleteScheduler();
+
+  // Create "conflict.data" and move it to trash 10 times
+  for (int i = 0; i < 10; i++) {
+    std::string dummy_file = NewDummyFile("conflict.data");
+    ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+  }
+  ASSERT_EQ(CountNormalFiles(), 0);
+  // 10 files ("conflict.data" x 10) in trash
+  ASSERT_EQ(CountTrashFiles(), 10);
+
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DeleteSchedulerTest::ConflictNames:1");
+  delete_scheduler_->WaitForEmptyTrash();
+  ASSERT_EQ(CountTrashFiles(), 0);
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete the 10 files using DeleteScheduler (move them to trsah)
+// 3- Delete the 10 files directly (using env_->DeleteFile)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that DeleteScheduler failed to delete the 10 files and
+//    reported 10 background errors
+TEST_F(DeleteSchedulerTest, BackgroundError) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::BackgroundError:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 Mb/sec
+  NewDeleteScheduler();
+
+  // Generate 10 dummy files and move them to trash
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = "data_" + ToString(i) + ".data";
+    ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+  }
+  ASSERT_EQ(CountNormalFiles(), 0);
+  ASSERT_EQ(CountTrashFiles(), 10);
+
+  // Delete 10 files from trash, this will cause background errors in
+  // BackgroundEmptyTrash since we already deleted the files it was
+  // goind to delete
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = "data_" + ToString(i) + ".data.trash";
+    ASSERT_OK(env_->DeleteFile(dummy_files_dirs_[0] + "/" + file_name));
+  }
+
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DeleteSchedulerTest::BackgroundError:1");
+  delete_scheduler_->WaitForEmptyTrash();
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 10);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete 10 dummy files using DeleteScheduler
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Make sure all files in trash directory were deleted
+// 5- Repeat previous steps 5 times
+TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
+  int bg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  // Move files to trash, wait for empty trash, start again
+  for (int run = 1; run <= 5; run++) {
+    // Generate 10 dummy files and move them to trash
+    for (int i = 0; i < 10; i++) {
+      std::string file_name = "data_" + ToString(i) + ".data";
+      ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+    }
+    ASSERT_EQ(CountNormalFiles(), 0);
+    delete_scheduler_->WaitForEmptyTrash();
+    ASSERT_EQ(bg_delete_file, 10 * run);
+    ASSERT_EQ(CountTrashFiles(), 0);
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+  }
+
+  ASSERT_EQ(bg_delete_file, 50);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DeletePartialFile) {
+  int bg_delete_file = 0;
+  int bg_fsync = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void*) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  // Should delete in 4 batch
+  ASSERT_OK(
+      delete_scheduler_->DeleteFile(NewDummyFile("data_1", 500 * 1024), ""));
+  ASSERT_OK(
+      delete_scheduler_->DeleteFile(NewDummyFile("data_2", 100 * 1024), ""));
+  // Should delete in 2 batch
+  ASSERT_OK(
+      delete_scheduler_->DeleteFile(NewDummyFile("data_2", 200 * 1024), ""));
+
+  delete_scheduler_->WaitForEmptyTrash();
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+  ASSERT_EQ(7, bg_delete_file);
+  ASSERT_EQ(4, bg_fsync);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+
+#ifdef OS_LINUX
+TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
+  int bg_delete_file = 0;
+  int bg_fsync = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void*) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  std::string file1 = NewDummyFile("data_1", 500 * 1024);
+  std::string file2 = NewDummyFile("data_2", 100 * 1024);
+
+  ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+  ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+  // Should delete in 4 batch if there is no hardlink
+  ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
+  ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
+
+  delete_scheduler_->WaitForEmptyTrash();
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+  ASSERT_EQ(2, bg_delete_file);
+  ASSERT_EQ(0, bg_fsync);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+#endif
+
+// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
+// 2- Delete 100 files using DeleteScheduler
+// 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
+// 4- Make sure that not all files were deleted from trash and that
+//    DeleteScheduler background thread did not delete all files
+TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
+  int bg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1;  // 1 Byte / sec
+  NewDeleteScheduler();
+
+  for (int i = 0; i < 100; i++) {
+    std::string file_name = "data_" + ToString(i) + ".data";
+    ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+  }
+
+  // Deleting 100 files will need >28 hours to delete
+  // we will delete the DeleteScheduler while delete queue is not empty
+  sst_file_mgr_.reset();
+
+  ASSERT_LT(bg_delete_file, 100);
+  ASSERT_GT(CountTrashFiles(), 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DISABLED_DynamicRateLimiting1) {
+  std::vector<uint64_t> penalties;
+  int bg_delete_file = 0;
+  int fg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::DynamicRateLimiting1:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 0;  // Disable rate limiting initially
+  NewDeleteScheduler();
+
+
+  int num_files = 10;  // 10 files
+  uint64_t file_size = 1024;  // every file is 1 kb
+
+  std::vector<int64_t> delete_kbs_per_sec = {512, 200, 0, 100, 50, -2, 25};
+  for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+    penalties.clear();
+    bg_delete_file = 0;
+    fg_delete_file = 0;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndCreateDir(dummy_files_dirs_[0]);
+    rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+    delete_scheduler_->SetRateBytesPerSecond(rate_bytes_per_sec_);
+
+    // Create 100 dummy files, every file is 1 Kb
+    std::vector<std::string> generated_files;
+    for (int i = 0; i < num_files; i++) {
+      std::string file_name = "file" + ToString(i) + ".data";
+      generated_files.push_back(NewDummyFile(file_name, file_size));
+    }
+
+    // Delete dummy files and measure time spent to empty trash
+    for (int i = 0; i < num_files; i++) {
+      ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+    }
+    ASSERT_EQ(CountNormalFiles(), 0);
+
+    if (rate_bytes_per_sec_ > 0) {
+      uint64_t delete_start_time = env_->NowMicros();
+      TEST_SYNC_POINT("DeleteSchedulerTest::DynamicRateLimiting1:1");
+      delete_scheduler_->WaitForEmptyTrash();
+      uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+      auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+      ASSERT_EQ(bg_errors.size(), 0);
+
+      uint64_t total_files_size = 0;
+      uint64_t expected_penlty = 0;
+      ASSERT_EQ(penalties.size(), num_files);
+      for (int i = 0; i < num_files; i++) {
+        total_files_size += file_size;
+        expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+        ASSERT_EQ(expected_penlty, penalties[i]);
+      }
+      ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+      ASSERT_EQ(bg_delete_file, num_files);
+      ASSERT_EQ(fg_delete_file, 0);
+    } else {
+      ASSERT_EQ(penalties.size(), 0);
+      ASSERT_EQ(bg_delete_file, 0);
+      ASSERT_EQ(fg_delete_file, num_files);
+    }
+
+    ASSERT_EQ(CountTrashFiles(), 0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) {
+  int bg_delete_file = 0;
+  int fg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_files = 100;  // 100 files
+  uint64_t file_size = 1024 * 10; // 100 KB as a file size
+  rate_bytes_per_sec_ = 1;  // 1 byte per sec (very slow trash delete)
+
+  NewDeleteScheduler();
+  delete_scheduler_->SetMaxTrashDBRatio(0.25);
+
+  std::vector<std::string> generated_files;
+  for (int i = 0; i < num_files; i++) {
+    std::string file_name = "file" + ToString(i) + ".data";
+    generated_files.push_back(NewDummyFile(file_name, file_size));
+  }
+
+  for (std::string& file_name : generated_files) {
+    delete_scheduler_->DeleteFile(file_name, "");
+  }
+
+  // When we end up with 26 files in trash we will start
+  // deleting new files immediately
+  ASSERT_EQ(fg_delete_file, 74);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, IsTrashCheck) {
+  // Trash files
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("x.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile(".trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.sst.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("/a/b/c/abc..sst.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("log.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("^^^^^.log.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.t.trash"));
+
+  // Not trash files
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.sst"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.txt"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sst"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sstrash"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("^^^^^.trashh"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.ttrash"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile(".ttrash"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+int main(int /*argc*/, char** /*argv*/) {
+  printf("DeleteScheduler is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/file_prefetch_buffer.cc b/src/rocksdb/file/file_prefetch_buffer.cc
new file mode 100644
index 000000000..7b55bd397
--- /dev/null
+++ b/src/rocksdb/file/file_prefetch_buffer.cc
@@ -0,0 +1,136 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/file_prefetch_buffer.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
+                                    uint64_t offset, size_t n,
+                                    bool for_compaction) {
+  if (!enable_ || reader == nullptr) {
+    return Status::OK();
+  }
+  size_t alignment = reader->file()->GetRequiredBufferAlignment();
+  size_t offset_ = static_cast<size_t>(offset);
+  uint64_t rounddown_offset = Rounddown(offset_, alignment);
+  uint64_t roundup_end = Roundup(offset_ + n, alignment);
+  uint64_t roundup_len = roundup_end - rounddown_offset;
+  assert(roundup_len >= alignment);
+  assert(roundup_len % alignment == 0);
+
+  // Check if requested bytes are in the existing buffer_.
+  // If all bytes exist -- return.
+  // If only a few bytes exist -- reuse them & read only what is really needed.
+  //     This is typically the case of incremental reading of data.
+  // If no bytes exist in buffer -- full pread.
+
+  Status s;
+  uint64_t chunk_offset_in_buffer = 0;
+  uint64_t chunk_len = 0;
+  bool copy_data_to_new_buffer = false;
+  if (buffer_.CurrentSize() > 0 && offset >= buffer_offset_ &&
+      offset <= buffer_offset_ + buffer_.CurrentSize()) {
+    if (offset + n <= buffer_offset_ + buffer_.CurrentSize()) {
+      // All requested bytes are already in the buffer. So no need to Read
+      // again.
+      return s;
+    } else {
+      // Only a few requested bytes are in the buffer. memmove those chunk of
+      // bytes to the beginning, and memcpy them back into the new buffer if a
+      // new buffer is created.
+      chunk_offset_in_buffer =
+          Rounddown(static_cast<size_t>(offset - buffer_offset_), alignment);
+      chunk_len = buffer_.CurrentSize() - chunk_offset_in_buffer;
+      assert(chunk_offset_in_buffer % alignment == 0);
+      assert(chunk_len % alignment == 0);
+      assert(chunk_offset_in_buffer + chunk_len <=
+             buffer_offset_ + buffer_.CurrentSize());
+      if (chunk_len > 0) {
+        copy_data_to_new_buffer = true;
+      } else {
+        // this reset is not necessary, but just to be safe.
+        chunk_offset_in_buffer = 0;
+      }
+    }
+  }
+
+  // Create a new buffer only if current capacity is not sufficient, and memcopy
+  // bytes from old buffer if needed (i.e., if chunk_len is greater than 0).
+  if (buffer_.Capacity() < roundup_len) {
+    buffer_.Alignment(alignment);
+    buffer_.AllocateNewBuffer(static_cast<size_t>(roundup_len),
+                              copy_data_to_new_buffer, chunk_offset_in_buffer,
+                              static_cast<size_t>(chunk_len));
+  } else if (chunk_len > 0) {
+    // New buffer not needed. But memmove bytes from tail to the beginning since
+    // chunk_len is greater than 0.
+    buffer_.RefitTail(static_cast<size_t>(chunk_offset_in_buffer),
+                      static_cast<size_t>(chunk_len));
+  }
+
+  Slice result;
+  s = reader->Read(rounddown_offset + chunk_len,
+                   static_cast<size_t>(roundup_len - chunk_len), &result,
+                   buffer_.BufferStart() + chunk_len, for_compaction);
+  if (s.ok()) {
+    buffer_offset_ = rounddown_offset;
+    buffer_.Size(static_cast<size_t>(chunk_len) + result.size());
+  }
+  return s;
+}
+
+bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
+                                          Slice* result, bool for_compaction) {
+  if (track_min_offset_ && offset < min_offset_read_) {
+    min_offset_read_ = static_cast<size_t>(offset);
+  }
+  if (!enable_ || offset < buffer_offset_) {
+    return false;
+  }
+
+  // If the buffer contains only a few of the requested bytes:
+  //    If readahead is enabled: prefetch the remaining bytes + readadhead bytes
+  //        and satisfy the request.
+  //    If readahead is not enabled: return false.
+  if (offset + n > buffer_offset_ + buffer_.CurrentSize()) {
+    if (readahead_size_ > 0) {
+      assert(file_reader_ != nullptr);
+      assert(max_readahead_size_ >= readahead_size_);
+      Status s;
+      if (for_compaction) {
+        s = Prefetch(file_reader_, offset, std::max(n, readahead_size_),
+                     for_compaction);
+      } else {
+        s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction);
+      }
+      if (!s.ok()) {
+        return false;
+      }
+      readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+    } else {
+      return false;
+    }
+  }
+
+  uint64_t offset_in_buffer = offset - buffer_offset_;
+  *result = Slice(buffer_.BufferStart() + offset_in_buffer, n);
+  return true;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_prefetch_buffer.h b/src/rocksdb/file/file_prefetch_buffer.h
new file mode 100644
index 000000000..d53f627b5
--- /dev/null
+++ b/src/rocksdb/file/file_prefetch_buffer.h
@@ -0,0 +1,97 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <sstream>
+#include <string>
+#include "file/random_access_file_reader.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FilePrefetchBuffer is a smart buffer to store and read data from a file.
+class FilePrefetchBuffer {
+ public:
+  // Constructor.
+  //
+  // All arguments are optional.
+  // file_reader        : the file reader to use. Can be a nullptr.
+  // readahead_size     : the initial readahead size.
+  // max_readahead_size : the maximum readahead size.
+  //   If max_readahead_size > readahead_size, the readahead size will be
+  //   doubled on every IO until max_readahead_size is hit.
+  //   Typically this is set as a multiple of readahead_size.
+  //   max_readahead_size should be greater than equal to readahead_size.
+  // enable : controls whether reading from the buffer is enabled.
+  //   If false, TryReadFromCache() always return false, and we only take stats
+  //   for the minimum offset if track_min_offset = true.
+  // track_min_offset : Track the minimum offset ever read and collect stats on
+  //   it. Used for adaptable readahead of the file footer/metadata.
+  //
+  // Automatic readhead is enabled for a file if file_reader, readahead_size,
+  // and max_readahead_size are passed in.
+  // If file_reader is a nullptr, setting readadhead_size and max_readahead_size
+  // does not make any sense. So it does nothing.
+  // A user can construct a FilePrefetchBuffer without any arguments, but use
+  // `Prefetch` to load data into the buffer.
+  FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr,
+                     size_t readadhead_size = 0, size_t max_readahead_size = 0,
+                     bool enable = true, bool track_min_offset = false)
+      : buffer_offset_(0),
+        file_reader_(file_reader),
+        readahead_size_(readadhead_size),
+        max_readahead_size_(max_readahead_size),
+        min_offset_read_(port::kMaxSizet),
+        enable_(enable),
+        track_min_offset_(track_min_offset) {}
+
+  // Load data into the buffer from a file.
+  // reader : the file reader.
+  // offset : the file offset to start reading from.
+  // n      : the number of bytes to read.
+  // for_compaction : if prefetch is done for compaction read.
+  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n,
+                  bool for_compaction = false);
+
+  // Tries returning the data for a file raed from this buffer, if that data is
+  // in the buffer.
+  // It handles tracking the minimum read offset if track_min_offset = true.
+  // It also does the exponential readahead when readadhead_size is set as part
+  // of the constructor.
+  //
+  // offset : the file offset.
+  // n      : the number of bytes.
+  // result : output buffer to put the data into.
+  // for_compaction : if cache read is done for compaction read.
+  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result,
+                        bool for_compaction = false);
+
+  // The minimum `offset` ever passed to TryReadFromCache(). This will nly be
+  // tracked if track_min_offset = true.
+  size_t min_offset_read() const { return min_offset_read_; }
+
+ private:
+  AlignedBuffer buffer_;
+  uint64_t buffer_offset_;
+  RandomAccessFileReader* file_reader_;
+  size_t readahead_size_;
+  size_t max_readahead_size_;
+  // The minimum `offset` ever passed to TryReadFromCache().
+  size_t min_offset_read_;
+  // if false, TryReadFromCache() always return false, and we only take stats
+  // for track_min_offset_ if track_min_offset_ = true
+  bool enable_;
+  // If true, track minimum `offset` ever passed to TryReadFromCache(), which
+  // can be fetched from min_offset_read().
+  bool track_min_offset_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_util.cc b/src/rocksdb/file/file_util.cc
new file mode 100644
index 000000000..c231bf7d1
--- /dev/null
+++ b/src/rocksdb/file/file_util.cc
@@ -0,0 +1,124 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "file/file_util.h"
+
+#include <string>
+#include <algorithm>
+
+#include "file/random_access_file_reader.h"
+#include "file/sequence_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Utility function to copy a file up to a specified length
+Status CopyFile(FileSystem* fs, const std::string& source,
+                const std::string& destination, uint64_t size, bool use_fsync) {
+  const FileOptions soptions;
+  Status s;
+  std::unique_ptr<SequentialFileReader> src_reader;
+  std::unique_ptr<WritableFileWriter> dest_writer;
+
+  {
+    std::unique_ptr<FSSequentialFile> srcfile;
+    s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+    std::unique_ptr<FSWritableFile> destfile;
+    s = fs->NewWritableFile(destination, soptions, &destfile, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (size == 0) {
+      // default argument means copy everything
+      s = fs->GetFileSize(source, IOOptions(), &size, nullptr);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    src_reader.reset(new SequentialFileReader(std::move(srcfile), source));
+    dest_writer.reset(
+        new WritableFileWriter(std::move(destfile), destination, soptions));
+  }
+
+  char buffer[4096];
+  Slice slice;
+  while (size > 0) {
+    size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size));
+    s = src_reader->Read(bytes_to_read, &slice, buffer);
+    if (!s.ok()) {
+      return s;
+    }
+    if (slice.size() == 0) {
+      return Status::Corruption("file too small");
+    }
+    s = dest_writer->Append(slice);
+    if (!s.ok()) {
+      return s;
+    }
+    size -= slice.size();
+  }
+  return dest_writer->Sync(use_fsync);
+}
+
+// Utility function to create a file with the provided contents
+Status CreateFile(FileSystem* fs, const std::string& destination,
+                  const std::string& contents, bool use_fsync) {
+  const EnvOptions soptions;
+  Status s;
+  std::unique_ptr<WritableFileWriter> dest_writer;
+
+  std::unique_ptr<FSWritableFile> destfile;
+  s = fs->NewWritableFile(destination, soptions, &destfile, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  dest_writer.reset(
+      new WritableFileWriter(std::move(destfile), destination, soptions));
+  s = dest_writer->Append(Slice(contents));
+  if (!s.ok()) {
+    return s;
+  }
+  return dest_writer->Sync(use_fsync);
+}
+
+Status DeleteDBFile(const ImmutableDBOptions* db_options,
+                    const std::string& fname, const std::string& dir_to_sync,
+                    const bool force_bg, const bool force_fg) {
+#ifndef ROCKSDB_LITE
+  SstFileManagerImpl* sfm =
+      static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
+  if (sfm && !force_fg) {
+    return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
+  } else {
+    return db_options->env->DeleteFile(fname);
+  }
+#else
+  (void)dir_to_sync;
+  (void)force_bg;
+  (void)force_fg;
+  // SstFileManager is not supported in ROCKSDB_LITE
+  // Delete file immediately
+  return db_options->env->DeleteFile(fname);
+#endif
+}
+
+bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) {
+  bool same = false;
+  assert(!db_options->db_paths.empty());
+  Status s = db_options->env->AreFilesSame(db_options->wal_dir,
+                                           db_options->db_paths[0].path, &same);
+  if (s.IsNotSupported()) {
+    same = db_options->wal_dir == db_options->db_paths[0].path;
+  }
+  return same;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_util.h b/src/rocksdb/file/file_util.h
new file mode 100644
index 000000000..333749e4d
--- /dev/null
+++ b/src/rocksdb/file/file_util.h
@@ -0,0 +1,33 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <string>
+
+#include "file/filename.h"
+#include "options/db_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+// use_fsync maps to options.use_fsync, which determines the way that
+// the file is synced after copying.
+extern Status CopyFile(FileSystem* fs, const std::string& source,
+                       const std::string& destination, uint64_t size,
+                       bool use_fsync);
+
+extern Status CreateFile(FileSystem* fs, const std::string& destination,
+                         const std::string& contents, bool use_fsync);
+
+extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
+                           const std::string& fname,
+                           const std::string& path_to_sync, const bool force_bg,
+                           const bool force_fg);
+
+extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/filename.cc b/src/rocksdb/file/filename.cc
new file mode 100644
index 000000000..d620b7351
--- /dev/null
+++ b/src/rocksdb/file/filename.cc
@@ -0,0 +1,456 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "file/filename.h"
+#include <cinttypes>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <vector>
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const std::string kRocksDbTFileExt = "sst";
+static const std::string kLevelDbTFileExt = "ldb";
+static const std::string kRocksDBBlobFileExt = "blob";
+
+// Given a path, flatten the path name by replacing all chars not in
+// {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end.
+// Return the number of chars stored in dest not including the trailing '\0'.
+static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) {
+  const char suffix[] = "_LOG";
+
+  size_t write_idx = 0;
+  size_t i = 0;
+  size_t src_len = path.size();
+
+  while (i < src_len && write_idx < len - sizeof(suffix)) {
+    if ((path[i] >= 'a' && path[i] <= 'z') ||
+        (path[i] >= '0' && path[i] <= '9') ||
+        (path[i] >= 'A' && path[i] <= 'Z') ||
+        path[i] == '-' ||
+        path[i] == '.' ||
+        path[i] == '_'){
+      dest[write_idx++] = path[i];
+    } else {
+      if (i > 0) {
+        dest[write_idx++] = '_';
+      }
+    }
+    i++;
+  }
+  assert(sizeof(suffix) <= len - write_idx);
+  // "\0" is automatically added by snprintf
+  snprintf(dest + write_idx, len - write_idx, suffix);
+  write_idx += sizeof(suffix) - 1;
+  return write_idx;
+}
+
+static std::string MakeFileName(uint64_t number, const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%06llu.%s",
+           static_cast<unsigned long long>(number), suffix);
+  return buf;
+}
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  return name + "/" + MakeFileName(number, suffix);
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "log");
+}
+
+std::string LogFileName(uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(number, "log");
+}
+
+std::string BlobFileName(const std::string& blobdirname, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str());
+}
+
+std::string BlobFileName(const std::string& dbname, const std::string& blob_dir,
+                         uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(dbname + "/" + blob_dir, number,
+                      kRocksDBBlobFileExt.c_str());
+}
+
+std::string ArchivalDirectory(const std::string& dir) {
+  return dir + "/" + ARCHIVAL_DIR;
+}
+std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
+}
+
+std::string MakeTableFileName(const std::string& path, uint64_t number) {
+  return MakeFileName(path, number, kRocksDbTFileExt.c_str());
+}
+
+std::string MakeTableFileName(uint64_t number) {
+  return MakeFileName(number, kRocksDbTFileExt.c_str());
+}
+
+std::string Rocks2LevelTableFileName(const std::string& fullname) {
+  assert(fullname.size() > kRocksDbTFileExt.size() + 1);
+  if (fullname.size() <= kRocksDbTFileExt.size() + 1) {
+    return "";
+  }
+  return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) +
+         kLevelDbTFileExt;
+}
+
+uint64_t TableFileNameToNumber(const std::string& name) {
+  uint64_t number = 0;
+  uint64_t base = 1;
+  int pos = static_cast<int>(name.find_last_of('.'));
+  while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') {
+    number += (name[pos] - '0') * base;
+    base *= 10;
+  }
+  return number;
+}
+
+std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
+                          uint32_t path_id) {
+  assert(number > 0);
+  std::string path;
+  if (path_id >= db_paths.size()) {
+    path = db_paths.back().path;
+  } else {
+    path = db_paths[path_id].path;
+  }
+  return MakeTableFileName(path, number);
+}
+
+void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+                      size_t out_buf_size) {
+  if (path_id == 0) {
+    snprintf(out_buf, out_buf_size, "%" PRIu64, number);
+  } else {
+    snprintf(out_buf, out_buf_size, "%" PRIu64
+                                    "(path "
+                                    "%" PRIu32 ")",
+             number, path_id);
+  }
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+  return dbname + "/CURRENT";
+}
+
+std::string LockFileName(const std::string& dbname) {
+  return dbname + "/LOCK";
+}
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  return MakeFileName(dbname, number, kTempFileNameSuffix.c_str());
+}
+
+InfoLogPrefix::InfoLogPrefix(bool has_log_dir,
+                             const std::string& db_absolute_path) {
+  if (!has_log_dir) {
+    const char kInfoLogPrefix[] = "LOG";
+    // "\0" is automatically added to the end
+    snprintf(buf, sizeof(buf), kInfoLogPrefix);
+    prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1);
+  } else {
+    size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf));
+    prefix = Slice(buf, len);
+  }
+}
+
+std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path, const std::string& log_dir) {
+  if (log_dir.empty()) {
+    return dbname + "/LOG";
+  }
+
+  InfoLogPrefix info_log_prefix(true, db_path);
+  return log_dir + "/" + info_log_prefix.buf;
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path, const std::string& log_dir) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
+
+  if (log_dir.empty()) {
+    return dbname + "/LOG.old." + buf;
+  }
+
+  InfoLogPrefix info_log_prefix(true, db_path);
+  return log_dir + "/" + info_log_prefix.buf + ".old." + buf;
+}
+
+std::string OptionsFileName(const std::string& dbname, uint64_t file_num) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s%06" PRIu64,
+           kOptionsFileNamePrefix.c_str(), file_num);
+  return dbname + "/" + buffer;
+}
+
+std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s",
+           kOptionsFileNamePrefix.c_str(), file_num,
+           kTempFileNameSuffix.c_str());
+  return dbname + "/" + buffer;
+}
+
+std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/METADB-%llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string IdentityFileName(const std::string& dbname) {
+  return dbname + "/IDENTITY";
+}
+
+// Owned filenames have the form:
+//    dbname/IDENTITY
+//    dbname/CURRENT
+//    dbname/LOCK
+//    dbname/<info_log_name_prefix>
+//    dbname/<info_log_name_prefix>.old.[0-9]+
+//    dbname/MANIFEST-[0-9]+
+//    dbname/[0-9]+.(log|sst|blob)
+//    dbname/METADB-[0-9]+
+//    dbname/OPTIONS-[0-9]+
+//    dbname/OPTIONS-[0-9]+.dbtmp
+//    Disregards / at the beginning
+bool ParseFileName(const std::string& fname,
+                   uint64_t* number,
+                   FileType* type,
+                   WalFileType* log_type) {
+  return ParseFileName(fname, number, "", type, log_type);
+}
+
+bool ParseFileName(const std::string& fname, uint64_t* number,
+                   const Slice& info_log_name_prefix, FileType* type,
+                   WalFileType* log_type) {
+  Slice rest(fname);
+  if (fname.length() > 1 && fname[0] == '/') {
+    rest.remove_prefix(1);
+  }
+  if (rest == "IDENTITY") {
+    *number = 0;
+    *type = kIdentityFile;
+  } else if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+  } else if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+  } else if (info_log_name_prefix.size() > 0 &&
+             rest.starts_with(info_log_name_prefix)) {
+    rest.remove_prefix(info_log_name_prefix.size());
+    if (rest == "" || rest == ".old") {
+      *number = 0;
+      *type = kInfoLogFile;
+    } else if (rest.starts_with(".old.")) {
+      uint64_t ts_suffix;
+      // sizeof also counts the trailing '\0'.
+      rest.remove_prefix(sizeof(".old.") - 1);
+      if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+        return false;
+      }
+      *number = ts_suffix;
+      *type = kInfoLogFile;
+    }
+  } else if (rest.starts_with("MANIFEST-")) {
+    rest.remove_prefix(strlen("MANIFEST-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kDescriptorFile;
+    *number = num;
+  } else if (rest.starts_with("METADB-")) {
+    rest.remove_prefix(strlen("METADB-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kMetaDatabase;
+    *number = num;
+  } else if (rest.starts_with(kOptionsFileNamePrefix)) {
+    uint64_t ts_suffix;
+    bool is_temp_file = false;
+    rest.remove_prefix(kOptionsFileNamePrefix.size());
+    const std::string kTempFileNameSuffixWithDot =
+        std::string(".") + kTempFileNameSuffix;
+    if (rest.ends_with(kTempFileNameSuffixWithDot)) {
+      rest.remove_suffix(kTempFileNameSuffixWithDot.size());
+      is_temp_file = true;
+    }
+    if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+      return false;
+    }
+    *number = ts_suffix;
+    *type = is_temp_file ? kTempFile : kOptionsFile;
+  } else {
+    // Avoid strtoull() to keep filename format independent of the
+    // current locale
+    bool archive_dir_found = false;
+    if (rest.starts_with(ARCHIVAL_DIR)) {
+      if (rest.size() <= ARCHIVAL_DIR.size()) {
+        return false;
+      }
+      rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
+      if (log_type) {
+        *log_type = kArchivedLogFile;
+      }
+      archive_dir_found = true;
+    }
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (rest.size() <= 1 || rest[0] != '.') {
+      return false;
+    }
+    rest.remove_prefix(1);
+
+    Slice suffix = rest;
+    if (suffix == Slice("log")) {
+      *type = kLogFile;
+      if (log_type && !archive_dir_found) {
+        *log_type = kAliveLogFile;
+      }
+    } else if (archive_dir_found) {
+      return false; // Archive dir can contain only log files
+    } else if (suffix == Slice(kRocksDbTFileExt) ||
+               suffix == Slice(kLevelDbTFileExt)) {
+      *type = kTableFile;
+    } else if (suffix == Slice(kRocksDBBlobFileExt)) {
+      *type = kBlobFile;
+    } else if (suffix == Slice(kTempFileNameSuffix)) {
+      *type = kTempFile;
+    } else {
+      return false;
+    }
+    *number = num;
+  }
+  return true;
+}
+
+Status SetCurrentFile(Env* env, const std::string& dbname,
+                      uint64_t descriptor_number,
+                      Directory* directory_to_fsync) {
+  // Remove leading "dbname/" and add newline to manifest file name
+  std::string manifest = DescriptorFileName(dbname, descriptor_number);
+  Slice contents = manifest;
+  assert(contents.starts_with(dbname + "/"));
+  contents.remove_prefix(dbname.size() + 1);
+  std::string tmp = TempFileName(dbname, descriptor_number);
+  Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true);
+  if (s.ok()) {
+    TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2);
+    s = env->RenameFile(tmp, CurrentFileName(dbname));
+    TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2);
+  }
+  if (s.ok()) {
+    if (directory_to_fsync != nullptr) {
+      s = directory_to_fsync->Fsync();
+    }
+  } else {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+Status SetIdentityFile(Env* env, const std::string& dbname,
+                       const std::string& db_id) {
+  std::string id;
+  if (db_id.empty()) {
+    id = env->GenerateUniqueId();
+  } else {
+    id = db_id;
+  }
+  assert(!id.empty());
+  // Reserve the filename dbname/000000.dbtmp for the temporary identity file
+  std::string tmp = TempFileName(dbname, 0);
+  Status s = WriteStringToFile(env, id, tmp, true);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, IdentityFileName(dbname));
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+Status SyncManifest(Env* env, const ImmutableDBOptions* db_options,
+                    WritableFileWriter* file) {
+  TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2);
+  StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
+  return file->Sync(db_options->use_fsync);
+}
+
+Status GetInfoLogFiles(Env* env, const std::string& db_log_dir,
+                       const std::string& dbname, std::string* parent_dir,
+                       std::vector<std::string>* info_log_list) {
+  assert(parent_dir != nullptr);
+  assert(info_log_list != nullptr);
+  uint64_t number = 0;
+  FileType type = kLogFile;
+
+  if (!db_log_dir.empty()) {
+    *parent_dir = db_log_dir;
+  } else {
+    *parent_dir = dbname;
+  }
+
+  InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname);
+
+  std::vector<std::string> file_names;
+  Status s = env->GetChildren(*parent_dir, &file_names);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (auto& f : file_names) {
+    if (ParseFileName(f, &number, info_log_prefix.prefix, &type) &&
+        (type == kInfoLogFile)) {
+      info_log_list->push_back(f);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/filename.h b/src/rocksdb/file/filename.h
new file mode 100644
index 000000000..909287c25
--- /dev/null
+++ b/src/rocksdb/file/filename.h
@@ -0,0 +1,185 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#pragma once
+#include <stdint.h>
+#include <unordered_map>
+#include <string>
+#include <vector>
+
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Directory;
+class WritableFileWriter;
+
+enum FileType {
+  kLogFile,
+  kDBLockFile,
+  kTableFile,
+  kDescriptorFile,
+  kCurrentFile,
+  kTempFile,
+  kInfoLogFile,  // Either the current one, or an old one
+  kMetaDatabase,
+  kIdentityFile,
+  kOptionsFile,
+  kBlobFile
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+extern std::string LogFileName(uint64_t number);
+
+extern std::string BlobFileName(const std::string& bdirname, uint64_t number);
+
+extern std::string BlobFileName(const std::string& dbname,
+                                const std::string& blob_dir, uint64_t number);
+
+static const std::string ARCHIVAL_DIR = "archive";
+
+extern std::string ArchivalDirectory(const std::string& dbname);
+
+//  Return the name of the archived log file with the specified number
+//  in the db named by "dbname". The result will be prefixed with "dbname".
+extern std::string ArchivedLogFileName(const std::string& dbname,
+                                       uint64_t num);
+
+extern std::string MakeTableFileName(const std::string& name, uint64_t number);
+
+extern std::string MakeTableFileName(uint64_t number);
+
+// Return the name of sstable with LevelDB suffix
+// created from RocksDB sstable suffixed name
+extern std::string Rocks2LevelTableFileName(const std::string& fullname);
+
+// the reverse function of MakeTableFileName
+// TODO(yhchiang): could merge this function with ParseFileName()
+extern uint64_t TableFileNameToNumber(const std::string& name);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::vector<DbPath>& db_paths,
+                                 uint64_t number, uint32_t path_id);
+
+// Sufficient buffer size for FormatFileNumber.
+const size_t kFormatFileNumberBufSize = 38;
+
+extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+                             size_t out_buf_size);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number.  The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file.  This file contains the name
+// of the current manifest file.  The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// A helper structure for prefix of info log names.
+struct InfoLogPrefix {
+  char buf[260];
+  Slice prefix;
+  // Prefix with DB absolute path encoded
+  explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
+  // Default Prefix
+  explicit InfoLogPrefix();
+};
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+                                   const std::string& db_path = "",
+                                   const std::string& log_dir = "");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+                                      const std::string& db_path = "",
+                                      const std::string& log_dir = "");
+
+static const std::string kOptionsFileNamePrefix = "OPTIONS-";
+static const std::string kTempFileNameSuffix = "dbtmp";
+
+// Return a options file name given the "dbname" and file number.
+// Format:  OPTIONS-[number].dbtmp
+extern std::string OptionsFileName(const std::string& dbname,
+                                   uint64_t file_num);
+
+// Return a temp options file name given the "dbname" and file number.
+// Format:  OPTIONS-[number]
+extern std::string TempOptionsFileName(const std::string& dbname,
+                                       uint64_t file_num);
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname,
+                                    uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh
+// either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number.  If the
+// filename was successfully parsed, returns true.  Else return false.
+// info_log_name_prefix is the path of info logs.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          const Slice& info_log_name_prefix, FileType* type,
+                          WalFileType* log_type = nullptr);
+// Same as previous function, but skip info log files.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          FileType* type, WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number,
+                             Directory* directory_to_fsync);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname,
+                              const std::string& db_id = {});
+
+// Sync manifest file `file`.
+extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options,
+                           WritableFileWriter* file);
+
+// Return list of file names of info logs in `file_names`.
+// The list only contains file name. The parent directory name is stored
+// in `parent_dir`.
+// `db_log_dir` should be the one as in options.db_log_dir
+extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir,
+                              const std::string& dbname,
+                              std::string* parent_dir,
+                              std::vector<std::string>* file_names);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/random_access_file_reader.cc b/src/rocksdb/file/random_access_file_reader.cc
new file mode 100644
index 000000000..46892360f
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader.cc
@@ -0,0 +1,189 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/random_access_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
+                                    char* scratch, bool for_compaction) const {
+  Status s;
+  uint64_t elapsed = 0;
+  {
+    StopWatch sw(env_, stats_, hist_type_,
+                 (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
+                 true /*delay_enabled*/);
+    auto prev_perf_level = GetPerfLevel();
+    IOSTATS_TIMER_GUARD(read_nanos);
+    if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+      size_t alignment = file_->GetRequiredBufferAlignment();
+      size_t aligned_offset =
+          TruncateToPageBoundary(alignment, static_cast<size_t>(offset));
+      size_t offset_advance = static_cast<size_t>(offset) - aligned_offset;
+      size_t read_size =
+          Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset;
+      AlignedBuffer buf;
+      buf.Alignment(alignment);
+      buf.AllocateNewBuffer(read_size);
+      while (buf.CurrentSize() < read_size) {
+        size_t allowed;
+        if (for_compaction && rate_limiter_ != nullptr) {
+          allowed = rate_limiter_->RequestToken(
+              buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
+              Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead);
+        } else {
+          assert(buf.CurrentSize() == 0);
+          allowed = read_size;
+        }
+        Slice tmp;
+
+        FileOperationInfo::TimePoint start_ts;
+        uint64_t orig_offset = 0;
+        if (ShouldNotifyListeners()) {
+          start_ts = std::chrono::system_clock::now();
+          orig_offset = aligned_offset + buf.CurrentSize();
+        }
+        {
+          IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
+          s = file_->Read(aligned_offset + buf.CurrentSize(), allowed,
+                          IOOptions(), &tmp, buf.Destination(), nullptr);
+        }
+        if (ShouldNotifyListeners()) {
+          auto finish_ts = std::chrono::system_clock::now();
+          NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
+                                 s);
+        }
+
+        buf.Size(buf.CurrentSize() + tmp.size());
+        if (!s.ok() || tmp.size() < allowed) {
+          break;
+        }
+      }
+      size_t res_len = 0;
+      if (s.ok() && offset_advance < buf.CurrentSize()) {
+        res_len = buf.Read(scratch, offset_advance,
+                           std::min(buf.CurrentSize() - offset_advance, n));
+      }
+      *result = Slice(scratch, res_len);
+#endif  // !ROCKSDB_LITE
+    } else {
+      size_t pos = 0;
+      const char* res_scratch = nullptr;
+      while (pos < n) {
+        size_t allowed;
+        if (for_compaction && rate_limiter_ != nullptr) {
+          if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+            sw.DelayStart();
+          }
+          allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */,
+                                                Env::IOPriority::IO_LOW, stats_,
+                                                RateLimiter::OpType::kRead);
+          if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+            sw.DelayStop();
+          }
+        } else {
+          allowed = n;
+        }
+        Slice tmp_result;
+
+#ifndef ROCKSDB_LITE
+        FileOperationInfo::TimePoint start_ts;
+        if (ShouldNotifyListeners()) {
+          start_ts = std::chrono::system_clock::now();
+        }
+#endif
+        {
+          IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
+          s = file_->Read(offset + pos, allowed, IOOptions(), &tmp_result,
+                          scratch + pos, nullptr);
+        }
+#ifndef ROCKSDB_LITE
+        if (ShouldNotifyListeners()) {
+          auto finish_ts = std::chrono::system_clock::now();
+          NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
+                                 finish_ts, s);
+        }
+#endif
+
+        if (res_scratch == nullptr) {
+          // we can't simply use `scratch` because reads of mmap'd files return
+          // data in a different buffer.
+          res_scratch = tmp_result.data();
+        } else {
+          // make sure chunks are inserted contiguously into `res_scratch`.
+          assert(tmp_result.data() == res_scratch + pos);
+        }
+        pos += tmp_result.size();
+        if (!s.ok() || tmp_result.size() < allowed) {
+          break;
+        }
+      }
+      *result = Slice(res_scratch, s.ok() ? pos : 0);
+    }
+    IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
+    SetPerfLevel(prev_perf_level);
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+
+  return s;
+}
+
+Status RandomAccessFileReader::MultiRead(FSReadRequest* read_reqs,
+                                         size_t num_reqs) const {
+  Status s;
+  uint64_t elapsed = 0;
+  assert(!use_direct_io());
+  {
+    StopWatch sw(env_, stats_, hist_type_,
+                 (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
+                 true /*delay_enabled*/);
+    auto prev_perf_level = GetPerfLevel();
+    IOSTATS_TIMER_GUARD(read_nanos);
+
+#ifndef ROCKSDB_LITE
+    FileOperationInfo::TimePoint start_ts;
+    if (ShouldNotifyListeners()) {
+      start_ts = std::chrono::system_clock::now();
+    }
+#endif  // ROCKSDB_LITE
+    {
+      IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
+      s = file_->MultiRead(read_reqs, num_reqs, IOOptions(), nullptr);
+    }
+    for (size_t i = 0; i < num_reqs; ++i) {
+#ifndef ROCKSDB_LITE
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = std::chrono::system_clock::now();
+        NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(),
+                               start_ts, finish_ts, read_reqs[i].status);
+      }
+#endif  // ROCKSDB_LITE
+      IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size());
+    }
+    SetPerfLevel(prev_perf_level);
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+
+  return s;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/random_access_file_reader.h b/src/rocksdb/file/random_access_file_reader.h
new file mode 100644
index 000000000..35027bf45
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader.h
@@ -0,0 +1,120 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <sstream>
+#include <string>
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class HistogramImpl;
+
+// RandomAccessFileReader is a wrapper on top of Env::RnadomAccessFile. It is
+// responsible for:
+// - Handling Buffered and Direct reads appropriately.
+// - Rate limiting compaction reads.
+// - Notifying any interested listeners on the completion of a read.
+// - Updating IO stats.
+class RandomAccessFileReader {
+ private:
+#ifndef ROCKSDB_LITE
+  void NotifyOnFileReadFinish(uint64_t offset, size_t length,
+                              const FileOperationInfo::TimePoint& start_ts,
+                              const FileOperationInfo::TimePoint& finish_ts,
+                              const Status& status) const {
+    FileOperationInfo info(file_name_, start_ts, finish_ts);
+    info.offset = offset;
+    info.length = length;
+    info.status = status;
+
+    for (auto& listener : listeners_) {
+      listener->OnFileReadFinish(info);
+    }
+  }
+#endif  // ROCKSDB_LITE
+
+  bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+
+  std::unique_ptr<FSRandomAccessFile> file_;
+  std::string file_name_;
+  Env* env_;
+  Statistics* stats_;
+  uint32_t hist_type_;
+  HistogramImpl* file_read_hist_;
+  RateLimiter* rate_limiter_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+
+ public:
+  explicit RandomAccessFileReader(
+      std::unique_ptr<FSRandomAccessFile>&& raf, std::string _file_name,
+      Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0,
+      HistogramImpl* file_read_hist = nullptr,
+      RateLimiter* rate_limiter = nullptr,
+      const std::vector<std::shared_ptr<EventListener>>& listeners = {})
+      : file_(std::move(raf)),
+        file_name_(std::move(_file_name)),
+        env_(env),
+        stats_(stats),
+        hist_type_(hist_type),
+        file_read_hist_(file_read_hist),
+        rate_limiter_(rate_limiter),
+        listeners_() {
+#ifndef ROCKSDB_LITE
+    std::for_each(listeners.begin(), listeners.end(),
+                  [this](const std::shared_ptr<EventListener>& e) {
+                    if (e->ShouldBeNotifiedOnFileIO()) {
+                      listeners_.emplace_back(e);
+                    }
+                  });
+#else  // !ROCKSDB_LITE
+    (void)listeners;
+#endif
+  }
+
+  RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT {
+    *this = std::move(o);
+  }
+
+  RandomAccessFileReader& operator=(RandomAccessFileReader&& o)
+      ROCKSDB_NOEXCEPT {
+    file_ = std::move(o.file_);
+    env_ = std::move(o.env_);
+    stats_ = std::move(o.stats_);
+    hist_type_ = std::move(o.hist_type_);
+    file_read_hist_ = std::move(o.file_read_hist_);
+    rate_limiter_ = std::move(o.rate_limiter_);
+    return *this;
+  }
+
+  RandomAccessFileReader(const RandomAccessFileReader&) = delete;
+  RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;
+
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch,
+              bool for_compaction = false) const;
+
+  Status MultiRead(FSReadRequest* reqs, size_t num_reqs) const;
+
+  Status Prefetch(uint64_t offset, size_t n) const {
+    return file_->Prefetch(offset, n, IOOptions(), nullptr);
+  }
+
+  FSRandomAccessFile* file() { return file_.get(); }
+
+  std::string file_name() const { return file_name_; }
+
+  bool use_direct_io() const { return file_->use_direct_io(); }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/read_write_util.cc b/src/rocksdb/file/read_write_util.cc
new file mode 100644
index 000000000..b4854e110
--- /dev/null
+++ b/src/rocksdb/file/read_write_util.cc
@@ -0,0 +1,67 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/read_write_util.h"
+
+#include <sstream>
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
+                         std::unique_ptr<FSWritableFile>* result,
+                         const FileOptions& options) {
+  IOStatus s = fs->NewWritableFile(fname, options, result, nullptr);
+  TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2);
+  return s;
+}
+
+bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader,
+                 std::string* output, bool* has_data, Status* result) {
+  const int kBufferSize = 8192;
+  char buffer[kBufferSize + 1];
+  Slice input_slice;
+
+  std::string line;
+  bool has_complete_line = false;
+  while (!has_complete_line) {
+    if (std::getline(*iss, line)) {
+      has_complete_line = !iss->eof();
+    } else {
+      has_complete_line = false;
+    }
+    if (!has_complete_line) {
+      // if we're not sure whether we have a complete line,
+      // further read from the file.
+      if (*has_data) {
+        *result = seq_file_reader->Read(kBufferSize, &input_slice, buffer);
+      }
+      if (input_slice.size() == 0) {
+        // meaning we have read all the data
+        *has_data = false;
+        break;
+      } else {
+        iss->str(line + input_slice.ToString());
+        // reset the internal state of iss so that we can keep reading it.
+        iss->clear();
+        *has_data = (input_slice.size() == kBufferSize);
+        continue;
+      }
+    }
+  }
+  *output = line;
+  return *has_data || has_complete_line;
+}
+
+#ifndef NDEBUG
+bool IsFileSectorAligned(const size_t off, size_t sector_size) {
+  return off % sector_size == 0;
+}
+#endif  // NDEBUG
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/read_write_util.h b/src/rocksdb/file/read_write_util.h
new file mode 100644
index 000000000..22f4076b3
--- /dev/null
+++ b/src/rocksdb/file/read_write_util.h
@@ -0,0 +1,34 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include "file/sequence_file_reader.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a WritableFile.
+//
+// env     : the Env.
+// fname   : the file name.
+// result  : output arg. A WritableFile based on `fname` returned.
+// options : the Env Options.
+extern IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
+                                std::unique_ptr<FSWritableFile>* result,
+                                const FileOptions& options);
+
+// Read a single line from a file.
+bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader,
+                 std::string* output, bool* has_data, Status* result);
+
+#ifndef NDEBUG
+bool IsFileSectorAligned(const size_t off, size_t sector_size);
+#endif  // NDEBUG
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_raf.cc b/src/rocksdb/file/readahead_raf.cc
new file mode 100644
index 000000000..493f9d9e8
--- /dev/null
+++ b/src/rocksdb/file/readahead_raf.cc
@@ -0,0 +1,162 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/readahead_raf.h"
+
+#include <algorithm>
+#include <mutex>
+#include "file/read_write_util.h"
+#include "util/aligned_buffer.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class ReadaheadRandomAccessFile : public RandomAccessFile {
+ public:
+  ReadaheadRandomAccessFile(std::unique_ptr<RandomAccessFile>&& file,
+                            size_t readahead_size)
+      : file_(std::move(file)),
+        alignment_(file_->GetRequiredBufferAlignment()),
+        readahead_size_(Roundup(readahead_size, alignment_)),
+        buffer_(),
+        buffer_offset_(0) {
+    buffer_.Alignment(alignment_);
+    buffer_.AllocateNewBuffer(readahead_size_);
+  }
+
+  ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
+
+  ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) =
+      delete;
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    // Read-ahead only make sense if we have some slack left after reading
+    if (n + alignment_ >= readahead_size_) {
+      return file_->Read(offset, n, result, scratch);
+    }
+
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t cached_len = 0;
+    // Check if there is a cache hit, meaning that [offset, offset + n) is
+    // either completely or partially in the buffer. If it's completely cached,
+    // including end of file case when offset + n is greater than EOF, then
+    // return.
+    if (TryReadFromCache(offset, n, &cached_len, scratch) &&
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
+      *result = Slice(scratch, cached_len);
+      return Status::OK();
+    }
+    size_t advanced_offset = static_cast<size_t>(offset + cached_len);
+    // In the case of cache hit advanced_offset is already aligned, means that
+    // chunk_offset equals to advanced_offset
+    size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
+
+    Status s = ReadIntoBuffer(chunk_offset, readahead_size_);
+    if (s.ok()) {
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(advanced_offset, n - cached_len, &remaining_len,
+                       scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
+    }
+    return s;
+  }
+
+  Status Prefetch(uint64_t offset, size_t n) override {
+    if (n < readahead_size_) {
+      // Don't allow smaller prefetches than the configured `readahead_size_`.
+      // `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
+      return Status::OK();
+    }
+
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t offset_ = static_cast<size_t>(offset);
+    size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_);
+    if (prefetch_offset == buffer_offset_) {
+      return Status::OK();
+    }
+    return ReadIntoBuffer(prefetch_offset,
+                          Roundup(offset_ + n, alignment_) - prefetch_offset);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return file_->GetUniqueId(id, max_size);
+  }
+
+  void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
+
+  Status InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.Clear();
+    return file_->InvalidateCache(offset, length);
+  }
+
+  bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read from buffer_ n bytes starting at offset. If anything was read
+  // from the cache, it sets cached_len to the number of bytes actually read,
+  // copies these number of bytes to scratch and returns true.
+  // If nothing was read sets cached_len to 0 and returns false.
+  bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
+                        char* scratch) const {
+    if (offset < buffer_offset_ ||
+        offset >= buffer_offset_ + buffer_.CurrentSize()) {
+      *cached_len = 0;
+      return false;
+    }
+    uint64_t offset_in_buffer = offset - buffer_offset_;
+    *cached_len = std::min(
+        buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+    memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+    return true;
+  }
+
+  // Reads into buffer_ the next n bytes from file_ starting at offset.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operastion on the file.
+  Status ReadIntoBuffer(uint64_t offset, size_t n) const {
+    if (n > buffer_.Capacity()) {
+      n = buffer_.Capacity();
+    }
+    assert(IsFileSectorAligned(offset, alignment_));
+    assert(IsFileSectorAligned(n, alignment_));
+    Slice result;
+    Status s = file_->Read(offset, n, &result, buffer_.BufferStart());
+    if (s.ok()) {
+      buffer_offset_ = offset;
+      buffer_.Size(result.size());
+      assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+    }
+    return s;
+  }
+
+  const std::unique_ptr<RandomAccessFile> file_;
+  const size_t alignment_;
+  const size_t readahead_size_;
+
+  mutable std::mutex lock_;
+  // The buffer storing the prefetched data
+  mutable AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
+  mutable uint64_t buffer_offset_;
+};
+}  // namespace
+
+std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
+    std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size) {
+  std::unique_ptr<RandomAccessFile> result(
+      new ReadaheadRandomAccessFile(std::move(file), readahead_size));
+  return result;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_raf.h b/src/rocksdb/file/readahead_raf.h
new file mode 100644
index 000000000..cbdcb124f
--- /dev/null
+++ b/src/rocksdb/file/readahead_raf.h
@@ -0,0 +1,27 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+// This file provides the following main abstractions:
+// SequentialFileReader : wrapper over Env::SequentialFile
+// RandomAccessFileReader : wrapper over Env::RandomAccessFile
+// WritableFileWriter : wrapper over Env::WritableFile
+// In addition, it also exposed NewReadaheadRandomAccessFile, NewWritableFile,
+// and ReadOneLine primitives.
+
+// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to
+// always prefetch additional data with every read. This is mainly used in
+// Compaction Table Readers.
+std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
+    std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sequence_file_reader.cc b/src/rocksdb/file/sequence_file_reader.cc
new file mode 100644
index 000000000..81c5e5d1d
--- /dev/null
+++ b/src/rocksdb/file/sequence_file_reader.cc
@@ -0,0 +1,237 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/sequence_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/read_write_util.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
+  Status s;
+  if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+    size_t offset = offset_.fetch_add(n);
+    size_t alignment = file_->GetRequiredBufferAlignment();
+    size_t aligned_offset = TruncateToPageBoundary(alignment, offset);
+    size_t offset_advance = offset - aligned_offset;
+    size_t size = Roundup(offset + n, alignment) - aligned_offset;
+    size_t r = 0;
+    AlignedBuffer buf;
+    buf.Alignment(alignment);
+    buf.AllocateNewBuffer(size);
+    Slice tmp;
+    s = file_->PositionedRead(aligned_offset, size, IOOptions(), &tmp,
+                              buf.BufferStart(), nullptr);
+    if (s.ok() && offset_advance < tmp.size()) {
+      buf.Size(tmp.size());
+      r = buf.Read(scratch, offset_advance,
+                   std::min(tmp.size() - offset_advance, n));
+    }
+    *result = Slice(scratch, r);
+#endif  // !ROCKSDB_LITE
+  } else {
+    s = file_->Read(n, IOOptions(), result, scratch, nullptr);
+  }
+  IOSTATS_ADD(bytes_read, result->size());
+  return s;
+}
+
+Status SequentialFileReader::Skip(uint64_t n) {
+#ifndef ROCKSDB_LITE
+  if (use_direct_io()) {
+    offset_ += static_cast<size_t>(n);
+    return Status::OK();
+  }
+#endif  // !ROCKSDB_LITE
+  return file_->Skip(n);
+}
+
+namespace {
+// This class wraps a SequentialFile, exposing same API, with the differenece
+// of being able to prefetch up to readahead_size bytes and then serve them
+// from memory, avoiding the entire round-trip if, for example, the data for the
+// file is actually remote.
+class ReadaheadSequentialFile : public FSSequentialFile {
+ public:
+  ReadaheadSequentialFile(std::unique_ptr<FSSequentialFile>&& file,
+                          size_t readahead_size)
+      : file_(std::move(file)),
+        alignment_(file_->GetRequiredBufferAlignment()),
+        readahead_size_(Roundup(readahead_size, alignment_)),
+        buffer_(),
+        buffer_offset_(0),
+        read_offset_(0) {
+    buffer_.Alignment(alignment_);
+    buffer_.AllocateNewBuffer(readahead_size_);
+  }
+
+  ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete;
+
+  ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete;
+
+  IOStatus Read(size_t n, const IOOptions& opts, Slice* result, char* scratch,
+                IODebugContext* dbg) override {
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t cached_len = 0;
+    // Check if there is a cache hit, meaning that [offset, offset + n) is
+    // either completely or partially in the buffer. If it's completely cached,
+    // including end of file case when offset + n is greater than EOF, then
+    // return.
+    if (TryReadFromCache(n, &cached_len, scratch) &&
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
+      *result = Slice(scratch, cached_len);
+      return IOStatus::OK();
+    }
+    n -= cached_len;
+
+    IOStatus s;
+    // Read-ahead only make sense if we have some slack left after reading
+    if (n + alignment_ >= readahead_size_) {
+      s = file_->Read(n, opts, result, scratch + cached_len, dbg);
+      if (s.ok()) {
+        read_offset_ += result->size();
+        *result = Slice(scratch, cached_len + result->size());
+      }
+      buffer_.Clear();
+      return s;
+    }
+
+    s = ReadIntoBuffer(readahead_size_, opts, dbg);
+    if (s.ok()) {
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(n, &remaining_len, scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
+    }
+    return s;
+  }
+
+  IOStatus Skip(uint64_t n) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    IOStatus s = IOStatus::OK();
+    // First check if we need to skip already cached data
+    if (buffer_.CurrentSize() > 0) {
+      // Do we need to skip beyond cached data?
+      if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) {
+        // Yes. Skip whaterver is in memory and adjust offset accordingly
+        n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_;
+        read_offset_ = buffer_offset_ + buffer_.CurrentSize();
+      } else {
+        // No. The entire section to be skipped is entirely i cache.
+        read_offset_ += n;
+        n = 0;
+      }
+    }
+    if (n > 0) {
+      // We still need to skip more, so call the file API for skipping
+      s = file_->Skip(n);
+      if (s.ok()) {
+        read_offset_ += n;
+      }
+      buffer_.Clear();
+    }
+    return s;
+  }
+
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& opts,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override {
+    return file_->PositionedRead(offset, n, opts, result, scratch, dbg);
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.Clear();
+    return file_->InvalidateCache(offset, length);
+  }
+
+  bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read from buffer_ n bytes. If anything was read from the cache, it
+  // sets cached_len to the number of bytes actually read, copies these number
+  // of bytes to scratch and returns true.
+  // If nothing was read sets cached_len to 0 and returns false.
+  bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) {
+    if (read_offset_ < buffer_offset_ ||
+        read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) {
+      *cached_len = 0;
+      return false;
+    }
+    uint64_t offset_in_buffer = read_offset_ - buffer_offset_;
+    *cached_len = std::min(
+        buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+    memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+    read_offset_ += *cached_len;
+    return true;
+  }
+
+  // Reads into buffer_ the next n bytes from file_.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operastion on the file.
+  IOStatus ReadIntoBuffer(size_t n, const IOOptions& opts,
+                          IODebugContext* dbg) {
+    if (n > buffer_.Capacity()) {
+      n = buffer_.Capacity();
+    }
+    assert(IsFileSectorAligned(n, alignment_));
+    Slice result;
+    IOStatus s = file_->Read(n, opts, &result, buffer_.BufferStart(), dbg);
+    if (s.ok()) {
+      buffer_offset_ = read_offset_;
+      buffer_.Size(result.size());
+      assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+    }
+    return s;
+  }
+
+  const std::unique_ptr<FSSequentialFile> file_;
+  const size_t alignment_;
+  const size_t readahead_size_;
+
+  std::mutex lock_;
+  // The buffer storing the prefetched data
+  AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
+  uint64_t buffer_offset_;
+  // The offset up to which data was read from file_. In fact, it can be larger
+  // than the actual file size, since the file_->Skip(n) call doesn't return the
+  // actual number of bytes that were skipped, which can be less than n.
+  // This is not a problemm since read_offset_ is monotonically increasing and
+  // its only use is to figure out if next piece of data should be read from
+  // buffer_ or file_ directly.
+  uint64_t read_offset_;
+};
+}  // namespace
+
+std::unique_ptr<FSSequentialFile>
+SequentialFileReader::NewReadaheadSequentialFile(
+    std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size) {
+  if (file->GetRequiredBufferAlignment() >= readahead_size) {
+    // Short-circuit and return the original file if readahead_size is
+    // too small and hence doesn't make sense to be used for prefetching.
+    return std::move(file);
+  }
+  std::unique_ptr<FSSequentialFile> result(
+      new ReadaheadSequentialFile(std::move(file), readahead_size));
+  return result;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sequence_file_reader.h b/src/rocksdb/file/sequence_file_reader.h
new file mode 100644
index 000000000..2f0898b42
--- /dev/null
+++ b/src/rocksdb/file/sequence_file_reader.h
@@ -0,0 +1,67 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles
+// Buffered (i.e when page cache is enabled) and Direct (with O_DIRECT / page
+// cache disabled) reads appropriately, and also updates the IO stats.
+class SequentialFileReader {
+ private:
+  std::unique_ptr<FSSequentialFile> file_;
+  std::string file_name_;
+  std::atomic<size_t> offset_{0};  // read offset
+
+ public:
+  explicit SequentialFileReader(std::unique_ptr<FSSequentialFile>&& _file,
+                                const std::string& _file_name)
+      : file_(std::move(_file)), file_name_(_file_name) {}
+
+  explicit SequentialFileReader(std::unique_ptr<FSSequentialFile>&& _file,
+                                const std::string& _file_name,
+                                size_t _readahead_size)
+      : file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size)),
+        file_name_(_file_name) {}
+
+  SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT {
+    *this = std::move(o);
+  }
+
+  SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT {
+    file_ = std::move(o.file_);
+    return *this;
+  }
+
+  SequentialFileReader(const SequentialFileReader&) = delete;
+  SequentialFileReader& operator=(const SequentialFileReader&) = delete;
+
+  Status Read(size_t n, Slice* result, char* scratch);
+
+  Status Skip(uint64_t n);
+
+  FSSequentialFile* file() { return file_.get(); }
+
+  std::string file_name() { return file_name_; }
+
+  bool use_direct_io() const { return file_->use_direct_io(); }
+
+ private:
+  // NewReadaheadSequentialFile provides a wrapper over SequentialFile to
+  // always prefetch additional data with every read.
+  static std::unique_ptr<FSSequentialFile> NewReadaheadSequentialFile(
+      std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sst_file_manager_impl.cc b/src/rocksdb/file/sst_file_manager_impl.cc
new file mode 100644
index 000000000..35056429e
--- /dev/null
+++ b/src/rocksdb/file/sst_file_manager_impl.cc
@@ -0,0 +1,558 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "file/sst_file_manager_impl.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "env/composite_env_wrapper.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/sst_file_manager.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr<FileSystem> fs,
+                                       std::shared_ptr<Logger> logger,
+                                       int64_t rate_bytes_per_sec,
+                                       double max_trash_db_ratio,
+                                       uint64_t bytes_max_delete_chunk)
+    : env_(env),
+      fs_(fs),
+      logger_(logger),
+      total_files_size_(0),
+      in_progress_files_size_(0),
+      compaction_buffer_size_(0),
+      cur_compactions_reserved_size_(0),
+      max_allowed_space_(0),
+      delete_scheduler_(env, fs_.get(), rate_bytes_per_sec, logger.get(), this,
+                        max_trash_db_ratio, bytes_max_delete_chunk),
+      cv_(&mu_),
+      closing_(false),
+      bg_thread_(nullptr),
+      reserved_disk_buffer_(0),
+      free_space_trigger_(0),
+      cur_instance_(nullptr) {}
+
+SstFileManagerImpl::~SstFileManagerImpl() {
+  Close();
+}
+
+void SstFileManagerImpl::Close() {
+  {
+    MutexLock l(&mu_);
+    if (closing_) {
+      return;
+    }
+    closing_ = true;
+    cv_.SignalAll();
+  }
+  if (bg_thread_) {
+    bg_thread_->join();
+  }
+}
+
+Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
+                                     bool compaction) {
+  uint64_t file_size;
+  Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+  if (s.ok()) {
+    MutexLock l(&mu_);
+    OnAddFileImpl(file_path, file_size, compaction);
+  }
+  TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile");
+  return s;
+}
+
+Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
+                                     uint64_t file_size, bool compaction) {
+  MutexLock l(&mu_);
+  OnAddFileImpl(file_path, file_size, compaction);
+  TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile");
+  return Status::OK();
+}
+
+Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
+  {
+    MutexLock l(&mu_);
+    OnDeleteFileImpl(file_path);
+  }
+  TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile");
+  return Status::OK();
+}
+
+void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) {
+  MutexLock l(&mu_);
+  uint64_t size_added_by_compaction = 0;
+  for (size_t i = 0; i < c->num_input_levels(); i++) {
+    for (size_t j = 0; j < c->num_input_files(i); j++) {
+      FileMetaData* filemeta = c->input(i, j);
+      size_added_by_compaction += filemeta->fd.GetFileSize();
+    }
+  }
+  cur_compactions_reserved_size_ -= size_added_by_compaction;
+
+  auto new_files = c->edit()->GetNewFiles();
+  for (auto& new_file : new_files) {
+    auto fn = TableFileName(c->immutable_cf_options()->cf_paths,
+                            new_file.second.fd.GetNumber(),
+                            new_file.second.fd.GetPathId());
+    if (in_progress_files_.find(fn) != in_progress_files_.end()) {
+      auto tracked_file = tracked_files_.find(fn);
+      assert(tracked_file != tracked_files_.end());
+      in_progress_files_size_ -= tracked_file->second;
+      in_progress_files_.erase(fn);
+    }
+  }
+}
+
+Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
+                                      const std::string& new_path,
+                                      uint64_t* file_size) {
+  {
+    MutexLock l(&mu_);
+    if (file_size != nullptr) {
+      *file_size = tracked_files_[old_path];
+    }
+    OnAddFileImpl(new_path, tracked_files_[old_path], false);
+    OnDeleteFileImpl(old_path);
+  }
+  TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
+  return Status::OK();
+}
+
+void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
+  MutexLock l(&mu_);
+  max_allowed_space_ = max_allowed_space;
+}
+
+void SstFileManagerImpl::SetCompactionBufferSize(
+    uint64_t compaction_buffer_size) {
+  MutexLock l(&mu_);
+  compaction_buffer_size_ = compaction_buffer_size;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
+  MutexLock l(&mu_);
+  if (max_allowed_space_ <= 0) {
+    return false;
+  }
+  return total_files_size_ >= max_allowed_space_;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
+  MutexLock l(&mu_);
+  if (max_allowed_space_ <= 0) {
+    return false;
+  }
+  return total_files_size_ + cur_compactions_reserved_size_ >=
+         max_allowed_space_;
+}
+
+bool SstFileManagerImpl::EnoughRoomForCompaction(
+    ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+    Status bg_error) {
+  MutexLock l(&mu_);
+  uint64_t size_added_by_compaction = 0;
+  // First check if we even have the space to do the compaction
+  for (size_t i = 0; i < inputs.size(); i++) {
+    for (size_t j = 0; j < inputs[i].size(); j++) {
+      FileMetaData* filemeta = inputs[i][j];
+      size_added_by_compaction += filemeta->fd.GetFileSize();
+    }
+  }
+
+  // Update cur_compactions_reserved_size_ so concurrent compaction
+  // don't max out space
+  size_t needed_headroom =
+      cur_compactions_reserved_size_ + size_added_by_compaction +
+      compaction_buffer_size_;
+  if (max_allowed_space_ != 0 &&
+      (needed_headroom + total_files_size_ > max_allowed_space_)) {
+    return false;
+  }
+
+  // Implement more aggressive checks only if this DB instance has already
+  // seen a NoSpace() error. This is tin order to contain a single potentially
+  // misbehaving DB instance and prevent it from slowing down compactions of
+  // other DB instances
+  if (CheckFreeSpace() && bg_error == Status::NoSpace()) {
+    auto fn =
+        TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(),
+                      inputs[0][0]->fd.GetPathId());
+    uint64_t free_space = 0;
+    fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr);
+    // needed_headroom is based on current size reserved by compactions,
+    // minus any files created by running compactions as they would count
+    // against the reserved size. If user didn't specify any compaction
+    // buffer, add reserved_disk_buffer_ that's calculated by default so the
+    // compaction doesn't end up leaving nothing for logs and flush SSTs
+    if (compaction_buffer_size_ == 0) {
+      needed_headroom += reserved_disk_buffer_;
+    }
+    needed_headroom -= in_progress_files_size_;
+    if (free_space < needed_headroom + size_added_by_compaction) {
+      // We hit the condition of not enough disk space
+      ROCKS_LOG_ERROR(logger_,
+                      "free space [%" PRIu64
+                      " bytes] is less than "
+                      "needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
+                      free_space, needed_headroom);
+      return false;
+    }
+  }
+
+  cur_compactions_reserved_size_ += size_added_by_compaction;
+  // Take a snapshot of cur_compactions_reserved_size_ for when we encounter
+  // a NoSpace error.
+  free_space_trigger_ = cur_compactions_reserved_size_;
+  return true;
+}
+
+uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
+  MutexLock l(&mu_);
+  return cur_compactions_reserved_size_;
+}
+
+uint64_t SstFileManagerImpl::GetTotalSize() {
+  MutexLock l(&mu_);
+  return total_files_size_;
+}
+
+std::unordered_map<std::string, uint64_t>
+SstFileManagerImpl::GetTrackedFiles() {
+  MutexLock l(&mu_);
+  return tracked_files_;
+}
+
+int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
+  return delete_scheduler_.GetRateBytesPerSecond();
+}
+
+void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
+  return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
+}
+
+double SstFileManagerImpl::GetMaxTrashDBRatio() {
+  return delete_scheduler_.GetMaxTrashDBRatio();
+}
+
+void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
+  return delete_scheduler_.SetMaxTrashDBRatio(r);
+}
+
+uint64_t SstFileManagerImpl::GetTotalTrashSize() {
+  return delete_scheduler_.GetTotalTrashSize();
+}
+
+void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
+                                           const std::string& path) {
+  MutexLock l(&mu_);
+
+  reserved_disk_buffer_ += size;
+  if (path_.empty()) {
+    path_ = path;
+  }
+}
+
+void SstFileManagerImpl::ClearError() {
+  while (true) {
+    MutexLock l(&mu_);
+
+    if (closing_) {
+      return;
+    }
+
+    uint64_t free_space = 0;
+    Status s = fs_->GetFreeSpace(path_, IOOptions(), &free_space, nullptr);
+    free_space = max_allowed_space_ > 0
+                     ? std::min(max_allowed_space_, free_space)
+                     : free_space;
+    if (s.ok()) {
+      // In case of multi-DB instances, some of them may have experienced a
+      // soft error and some a hard error. In the SstFileManagerImpl, a hard
+      // error will basically override previously reported soft errors. Once
+      // we clear the hard error, we don't keep track of previous errors for
+      // now
+      if (bg_err_.severity() == Status::Severity::kHardError) {
+        if (free_space < reserved_disk_buffer_) {
+          ROCKS_LOG_ERROR(logger_,
+                          "free space [%" PRIu64
+                          " bytes] is less than "
+                          "required disk buffer [%" PRIu64 " bytes]\n",
+                          free_space, reserved_disk_buffer_);
+          ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
+          s = Status::NoSpace();
+        }
+      } else if (bg_err_.severity() == Status::Severity::kSoftError) {
+        if (free_space < free_space_trigger_) {
+          ROCKS_LOG_WARN(logger_,
+                         "free space [%" PRIu64
+                         " bytes] is less than "
+                         "free space for compaction trigger [%" PRIu64
+                         " bytes]\n",
+                         free_space, free_space_trigger_);
+          ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
+          s = Status::NoSpace();
+        }
+      }
+    }
+
+    // Someone could have called CancelErrorRecovery() and the list could have
+    // become empty, so check again here
+    if (s.ok() && !error_handler_list_.empty()) {
+      auto error_handler = error_handler_list_.front();
+      // Since we will release the mutex, set cur_instance_ to signal to the
+      // shutdown thread, if it calls // CancelErrorRecovery() the meantime,
+      // to indicate that this DB instance is busy. The DB instance is
+      // guaranteed to not be deleted before RecoverFromBGError() returns,
+      // since the ErrorHandler::recovery_in_prog_ flag would be true
+      cur_instance_ = error_handler;
+      mu_.Unlock();
+      s = error_handler->RecoverFromBGError();
+      TEST_SYNC_POINT("SstFileManagerImpl::ErrorCleared");
+      mu_.Lock();
+      // The DB instance might have been deleted while we were
+      // waiting for the mutex, so check cur_instance_ to make sure its
+      // still non-null
+      if (cur_instance_) {
+        // Check for error again, since the instance may have recovered but
+        // immediately got another error. If that's the case, and the new
+        // error is also a NoSpace() non-fatal error, leave the instance in
+        // the list
+        Status err = cur_instance_->GetBGError();
+        if (s.ok() && err == Status::NoSpace() &&
+            err.severity() < Status::Severity::kFatalError) {
+          s = err;
+        }
+        cur_instance_ = nullptr;
+      }
+
+      if (s.ok() || s.IsShutdownInProgress() ||
+          (!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
+        // If shutdown is in progress, abandon this handler instance
+        // and continue with the others
+        error_handler_list_.pop_front();
+      }
+    }
+
+    if (!error_handler_list_.empty()) {
+      // If there are more instances to be recovered, reschedule after 5
+      // seconds
+      int64_t wait_until = env_->NowMicros() + 5000000;
+      cv_.TimedWait(wait_until);
+    }
+
+    // Check again for error_handler_list_ empty, as a DB instance shutdown
+    // could have removed it from the queue while we were in timed wait
+    if (error_handler_list_.empty()) {
+      ROCKS_LOG_INFO(logger_, "Clearing error\n");
+      bg_err_ = Status::OK();
+      return;
+    }
+  }
+}
+
+void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
+                                            Status bg_error) {
+  MutexLock l(&mu_);
+  if (bg_error.severity() == Status::Severity::kSoftError) {
+    if (bg_err_.ok()) {
+      // Setting bg_err_ basically means we're in degraded mode
+      // Assume that all pending compactions will fail similarly. The trigger
+      // for clearing this condition is set to current compaction reserved
+      // size, so we stop checking disk space available in
+      // EnoughRoomForCompaction once this much free space is available
+      bg_err_ = bg_error;
+    }
+  } else if (bg_error.severity() == Status::Severity::kHardError) {
+    bg_err_ = bg_error;
+  } else {
+    assert(false);
+  }
+
+  // If this is the first instance of this error, kick of a thread to poll
+  // and recover from this condition
+  if (error_handler_list_.empty()) {
+    error_handler_list_.push_back(handler);
+    // Release lock before calling join. Its ok to do so because
+    // error_handler_list_ is now non-empty, so no other invocation of this
+    // function will execute this piece of code
+    mu_.Unlock();
+    if (bg_thread_) {
+      bg_thread_->join();
+    }
+    // Start a new thread. The previous one would have exited.
+    bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this));
+    mu_.Lock();
+  } else {
+    // Check if this DB instance is already in the list
+    for (auto iter = error_handler_list_.begin();
+         iter != error_handler_list_.end(); ++iter) {
+      if ((*iter) == handler) {
+        return;
+      }
+    }
+    error_handler_list_.push_back(handler);
+  }
+}
+
+bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
+  MutexLock l(&mu_);
+
+  if (cur_instance_ == handler) {
+    // This instance is currently busy attempting to recover
+    // Nullify it so the recovery thread doesn't attempt to access it again
+    cur_instance_ = nullptr;
+    return false;
+  }
+
+  for (auto iter = error_handler_list_.begin();
+       iter != error_handler_list_.end(); ++iter) {
+    if ((*iter) == handler) {
+      error_handler_list_.erase(iter);
+      return true;
+    }
+  }
+  return false;
+}
+
+Status SstFileManagerImpl::ScheduleFileDeletion(
+    const std::string& file_path, const std::string& path_to_sync,
+    const bool force_bg) {
+  TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::ScheduleFileDeletion",
+                           const_cast<std::string*>(&file_path));
+  return delete_scheduler_.DeleteFile(file_path, path_to_sync,
+                                      force_bg);
+}
+
+void SstFileManagerImpl::WaitForEmptyTrash() {
+  delete_scheduler_.WaitForEmptyTrash();
+}
+
+void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
+                                       uint64_t file_size, bool compaction) {
+  auto tracked_file = tracked_files_.find(file_path);
+  if (tracked_file != tracked_files_.end()) {
+    // File was added before, we will just update the size
+    assert(!compaction);
+    total_files_size_ -= tracked_file->second;
+    total_files_size_ += file_size;
+    cur_compactions_reserved_size_ -= file_size;
+  } else {
+    total_files_size_ += file_size;
+    if (compaction) {
+      // Keep track of the size of files created by in-progress compactions.
+      // When calculating whether there's enough headroom for new compactions,
+      // this will be subtracted from cur_compactions_reserved_size_.
+      // Otherwise, compactions will be double counted.
+      in_progress_files_size_ += file_size;
+      in_progress_files_.insert(file_path);
+    }
+  }
+  tracked_files_[file_path] = file_size;
+}
+
+void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) {
+  auto tracked_file = tracked_files_.find(file_path);
+  if (tracked_file == tracked_files_.end()) {
+    // File is not tracked
+    assert(in_progress_files_.find(file_path) == in_progress_files_.end());
+    return;
+  }
+
+  total_files_size_ -= tracked_file->second;
+  // Check if it belonged to an in-progress compaction
+  if (in_progress_files_.find(file_path) != in_progress_files_.end()) {
+    in_progress_files_size_ -= tracked_file->second;
+    in_progress_files_.erase(file_path);
+  }
+  tracked_files_.erase(tracked_file);
+}
+
+SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log,
+                                  std::string trash_dir,
+                                  int64_t rate_bytes_per_sec,
+                                  bool delete_existing_trash, Status* status,
+                                  double max_trash_db_ratio,
+                                  uint64_t bytes_max_delete_chunk) {
+  std::shared_ptr<FileSystem> fs;
+
+  if (env == Env::Default()) {
+    fs = FileSystem::Default();
+  } else {
+    fs.reset(new LegacyFileSystemWrapper(env));
+  }
+
+  return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec,
+                           delete_existing_trash, status, max_trash_db_ratio,
+                           bytes_max_delete_chunk);
+}
+
+SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<FileSystem> fs,
+                                  std::shared_ptr<Logger> info_log,
+                                  const std::string& trash_dir,
+                                  int64_t rate_bytes_per_sec,
+                                  bool delete_existing_trash, Status* status,
+                                  double max_trash_db_ratio,
+                                  uint64_t bytes_max_delete_chunk) {
+  SstFileManagerImpl* res =
+      new SstFileManagerImpl(env, fs, info_log, rate_bytes_per_sec,
+                             max_trash_db_ratio, bytes_max_delete_chunk);
+
+  // trash_dir is deprecated and not needed anymore, but if user passed it
+  // we will still remove files in it.
+  Status s;
+  if (delete_existing_trash && trash_dir != "") {
+    std::vector<std::string> files_in_trash;
+    s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr);
+    if (s.ok()) {
+      for (const std::string& trash_file : files_in_trash) {
+        if (trash_file == "." || trash_file == "..") {
+          continue;
+        }
+
+        std::string path_in_trash = trash_dir + "/" + trash_file;
+        res->OnAddFile(path_in_trash);
+        Status file_delete =
+            res->ScheduleFileDeletion(path_in_trash, trash_dir);
+        if (s.ok() && !file_delete.ok()) {
+          s = file_delete;
+        }
+      }
+    }
+  }
+
+  if (status) {
+    *status = s;
+  }
+
+  return res;
+}
+
+#else
+
+SstFileManager* NewSstFileManager(Env* /*env*/,
+                                  std::shared_ptr<Logger> /*info_log*/,
+                                  std::string /*trash_dir*/,
+                                  int64_t /*rate_bytes_per_sec*/,
+                                  bool /*delete_existing_trash*/,
+                                  Status* status, double /*max_trash_db_ratio*/,
+                                  uint64_t /*bytes_max_delete_chunk*/) {
+  if (status) {
+    *status =
+        Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE");
+  }
+  return nullptr;
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sst_file_manager_impl.h b/src/rocksdb/file/sst_file_manager_impl.h
new file mode 100644
index 000000000..323ffc7b2
--- /dev/null
+++ b/src/rocksdb/file/sst_file_manager_impl.h
@@ -0,0 +1,197 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "port/port.h"
+
+#include "db/compaction/compaction.h"
+#include "db/error_handler.h"
+#include "file/delete_scheduler.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Logger;
+
+// SstFileManager is used to track SST files in the DB and control there
+// deletion rate.
+// All SstFileManager public functions are thread-safe.
+class SstFileManagerImpl : public SstFileManager {
+ public:
+  explicit SstFileManagerImpl(Env* env, std::shared_ptr<FileSystem> fs,
+                              std::shared_ptr<Logger> logger,
+                              int64_t rate_bytes_per_sec,
+                              double max_trash_db_ratio,
+                              uint64_t bytes_max_delete_chunk);
+
+  ~SstFileManagerImpl();
+
+  // DB will call OnAddFile whenever a new sst file is added.
+  Status OnAddFile(const std::string& file_path, bool compaction = false);
+
+  // Overload where size of the file is provided by the caller rather than
+  // queried from the filesystem. This is an optimization.
+  Status OnAddFile(const std::string& file_path, uint64_t file_size,
+                   bool compaction);
+
+  // DB will call OnDeleteFile whenever an sst file is deleted.
+  Status OnDeleteFile(const std::string& file_path);
+
+  // DB will call OnMoveFile whenever an sst file is move to a new path.
+  Status OnMoveFile(const std::string& old_path, const std::string& new_path,
+                    uint64_t* file_size = nullptr);
+
+  // Update the maximum allowed space that should be used by RocksDB, if
+  // the total size of the SST files exceeds max_allowed_space, writes to
+  // RocksDB will fail.
+  //
+  // Setting max_allowed_space to 0 will disable this feature, maximum allowed
+  // space will be infinite (Default value).
+  //
+  // thread-safe.
+  void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override;
+
+  void SetCompactionBufferSize(uint64_t compaction_buffer_size) override;
+
+  // Return true if the total size of SST files exceeded the maximum allowed
+  // space usage.
+  //
+  // thread-safe.
+  bool IsMaxAllowedSpaceReached() override;
+
+  bool IsMaxAllowedSpaceReachedIncludingCompactions() override;
+
+  // Returns true is there is enough (approximate) space for the specified
+  // compaction. Space is approximate because this function conservatively
+  // estimates how much space is currently being used by compactions (i.e.
+  // if a compaction has started, this function bumps the used space by
+  // the full compaction size).
+  bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+                               const std::vector<CompactionInputFiles>& inputs,
+                               Status bg_error);
+
+  // Bookkeeping so total_file_sizes_ goes back to normal after compaction
+  // finishes
+  void OnCompactionCompletion(Compaction* c);
+
+  uint64_t GetCompactionsReservedSize();
+
+  // Return the total size of all tracked files.
+  uint64_t GetTotalSize() override;
+
+  // Return a map containing all tracked files and there corresponding sizes.
+  std::unordered_map<std::string, uint64_t> GetTrackedFiles() override;
+
+  // Return delete rate limit in bytes per second.
+  virtual int64_t GetDeleteRateBytesPerSecond() override;
+
+  // Update the delete rate limit in bytes per second.
+  virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) override;
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  virtual double GetMaxTrashDBRatio() override;
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  virtual void SetMaxTrashDBRatio(double ratio) override;
+
+  // Return the total size of trash files
+  uint64_t GetTotalTrashSize() override;
+
+  // Called by each DB instance using this sst file manager to reserve
+  // disk buffer space for recovery from out of space errors
+  void ReserveDiskBuffer(uint64_t buffer, const std::string& path);
+
+  // Set a flag upon encountering disk full. May enqueue the ErrorHandler
+  // instance for background polling and recovery
+  void StartErrorRecovery(ErrorHandler* db, Status bg_error);
+
+  // Remove the given Errorhandler instance from the recovery queue. Its
+  // not guaranteed
+  bool CancelErrorRecovery(ErrorHandler* db);
+
+  // Mark file as trash and schedule it's deletion. If force_bg is set, it
+  // forces the file to be deleting in the background regardless of DB size,
+  // except when rate limited delete is disabled
+  virtual Status ScheduleFileDeletion(const std::string& file_path,
+                                      const std::string& dir_to_sync,
+                                      const bool force_bg = false);
+
+  // Wait for all files being deleteing in the background to finish or for
+  // destructor to be called.
+  virtual void WaitForEmptyTrash();
+
+  DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
+
+  // Stop the error recovery background thread. This should be called only
+  // once in the object's lifetime, and before the destructor
+  void Close();
+
+ private:
+  // REQUIRES: mutex locked
+  void OnAddFileImpl(const std::string& file_path, uint64_t file_size,
+                     bool compaction);
+  // REQUIRES: mutex locked
+  void OnDeleteFileImpl(const std::string& file_path);
+
+  void ClearError();
+  bool CheckFreeSpace() {
+    return bg_err_.severity() == Status::Severity::kSoftError;
+  }
+
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<Logger> logger_;
+  // Mutex to protect tracked_files_, total_files_size_
+  port::Mutex mu_;
+  // The summation of the sizes of all files in tracked_files_ map
+  uint64_t total_files_size_;
+  // The summation of all output files of in-progress compactions
+  uint64_t in_progress_files_size_;
+  // Compactions should only execute if they can leave at least
+  // this amount of buffer space for logs and flushes
+  uint64_t compaction_buffer_size_;
+  // Estimated size of the current ongoing compactions
+  uint64_t cur_compactions_reserved_size_;
+  // A map containing all tracked files and there sizes
+  //  file_path => file_size
+  std::unordered_map<std::string, uint64_t> tracked_files_;
+  // A set of files belonging to in-progress compactions
+  std::unordered_set<std::string> in_progress_files_;
+  // The maximum allowed space (in bytes) for sst files.
+  uint64_t max_allowed_space_;
+  // DeleteScheduler used to throttle file deletition.
+  DeleteScheduler delete_scheduler_;
+  port::CondVar cv_;
+  // Flag to force error recovery thread to exit
+  bool closing_;
+  // Background error recovery thread
+  std::unique_ptr<port::Thread> bg_thread_;
+  // A path in the filesystem corresponding to this SFM. This is used for
+  // calling Env::GetFreeSpace. Posix requires a path in the filesystem
+  std::string path_;
+  // Save the current background error
+  Status bg_err_;
+  // Amount of free disk headroom before allowing recovery from hard errors
+  uint64_t reserved_disk_buffer_;
+  // For soft errors, amount of free disk space before we can allow
+  // compactions to run full throttle. If disk space is below this trigger,
+  // compactions will be gated by free disk space > input size
+  uint64_t free_space_trigger_;
+  // List of database error handler instances tracked by this sst file manager
+  std::list<ErrorHandler*> error_handler_list_;
+  // Pointer to ErrorHandler instance that is currently processing recovery
+  ErrorHandler* cur_instance_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/writable_file_writer.cc b/src/rocksdb/file/writable_file_writer.cc
new file mode 100644
index 000000000..d5894c17a
--- /dev/null
+++ b/src/rocksdb/file/writable_file_writer.cc
@@ -0,0 +1,429 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/writable_file_writer.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "db/version_edit.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status WritableFileWriter::Append(const Slice& data) {
+  const char* src = data.data();
+  size_t left = data.size();
+  Status s;
+  pending_sync_ = true;
+
+  TEST_KILL_RANDOM("WritableFileWriter::Append:0",
+                   rocksdb_kill_odds * REDUCE_ODDS2);
+
+  {
+    IOSTATS_TIMER_GUARD(prepare_write_nanos);
+    TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
+    writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left,
+                                 IOOptions(), nullptr);
+  }
+
+  // See whether we need to enlarge the buffer to avoid the flush
+  if (buf_.Capacity() - buf_.CurrentSize() < left) {
+    for (size_t cap = buf_.Capacity();
+         cap < max_buffer_size_;  // There is still room to increase
+         cap *= 2) {
+      // See whether the next available size is large enough.
+      // Buffer will never be increased to more than max_buffer_size_.
+      size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
+      if (desired_capacity - buf_.CurrentSize() >= left ||
+          (use_direct_io() && desired_capacity == max_buffer_size_)) {
+        buf_.AllocateNewBuffer(desired_capacity, true);
+        break;
+      }
+    }
+  }
+
+  // Flush only when buffered I/O
+  if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
+    if (buf_.CurrentSize() > 0) {
+      s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    assert(buf_.CurrentSize() == 0);
+  }
+
+  // We never write directly to disk with direct I/O on.
+  // or we simply use it for its original purpose to accumulate many small
+  // chunks
+  if (use_direct_io() || (buf_.Capacity() >= left)) {
+    while (left > 0) {
+      size_t appended = buf_.Append(src, left);
+      left -= appended;
+      src += appended;
+
+      if (left > 0) {
+        s = Flush();
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+  } else {
+    // Writing directly to file bypassing the buffer
+    assert(buf_.CurrentSize() == 0);
+    s = WriteBuffered(src, left);
+  }
+
+  TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds);
+  if (s.ok()) {
+    filesize_ += data.size();
+    CalculateFileChecksum(data);
+  }
+  return s;
+}
+
+Status WritableFileWriter::Pad(const size_t pad_bytes) {
+  assert(pad_bytes < kDefaultPageSize);
+  size_t left = pad_bytes;
+  size_t cap = buf_.Capacity() - buf_.CurrentSize();
+
+  // Assume pad_bytes is small compared to buf_ capacity. So we always
+  // use buf_ rather than write directly to file in certain cases like
+  // Append() does.
+  while (left) {
+    size_t append_bytes = std::min(cap, left);
+    buf_.PadWith(append_bytes, 0);
+    left -= append_bytes;
+    if (left > 0) {
+      Status s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    cap = buf_.Capacity() - buf_.CurrentSize();
+  }
+  pending_sync_ = true;
+  filesize_ += pad_bytes;
+  return Status::OK();
+}
+
+Status WritableFileWriter::Close() {
+  // Do not quit immediately on failure the file MUST be closed
+  Status s;
+
+  // Possible to close it twice now as we MUST close
+  // in __dtor, simply flushing is not enough
+  // Windows when pre-allocating does not fill with zeros
+  // also with unbuffered access we also set the end of data.
+  if (!writable_file_) {
+    return s;
+  }
+
+  s = Flush();  // flush cache to OS
+
+  Status interim;
+  // In direct I/O mode we write whole pages so
+  // we need to let the file know where data ends.
+  if (use_direct_io()) {
+    interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr);
+    if (interim.ok()) {
+      interim = writable_file_->Fsync(IOOptions(), nullptr);
+    }
+    if (!interim.ok() && s.ok()) {
+      s = interim;
+    }
+  }
+
+  TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds);
+  interim = writable_file_->Close(IOOptions(), nullptr);
+  if (!interim.ok() && s.ok()) {
+    s = interim;
+  }
+
+  writable_file_.reset();
+  TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds);
+
+  return s;
+}
+
+// write out the cached data to the OS cache or storage if direct I/O
+// enabled
+Status WritableFileWriter::Flush() {
+  Status s;
+  TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
+                   rocksdb_kill_odds * REDUCE_ODDS2);
+
+  if (buf_.CurrentSize() > 0) {
+    if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+      if (pending_sync_) {
+        s = WriteDirect();
+      }
+#endif  // !ROCKSDB_LITE
+    } else {
+      s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
+    }
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  s = writable_file_->Flush(IOOptions(), nullptr);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // sync OS cache to disk for every bytes_per_sync_
+  // TODO: give log file and sst file different options (log
+  // files could be potentially cached in OS for their whole
+  // life time, thus we might not want to flush at all).
+
+  // We try to avoid sync to the last 1MB of data. For two reasons:
+  // (1) avoid rewrite the same page that is modified later.
+  // (2) for older version of OS, write can block while writing out
+  //     the page.
+  // Xfs does neighbor page flushing outside of the specified ranges. We
+  // need to make sure sync range is far from the write offset.
+  if (!use_direct_io() && bytes_per_sync_) {
+    const uint64_t kBytesNotSyncRange =
+        1024 * 1024;                                // recent 1MB is not synced.
+    const uint64_t kBytesAlignWhenSync = 4 * 1024;  // Align 4KB.
+    if (filesize_ > kBytesNotSyncRange) {
+      uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
+      offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
+      assert(offset_sync_to >= last_sync_size_);
+      if (offset_sync_to > 0 &&
+          offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
+        s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
+        last_sync_size_ = offset_sync_to;
+      }
+    }
+  }
+
+  return s;
+}
+
+const char* WritableFileWriter::GetFileChecksumFuncName() const {
+  if (checksum_func_ != nullptr) {
+    return checksum_func_->Name();
+  } else {
+    return kUnknownFileChecksumFuncName.c_str();
+  }
+}
+
+Status WritableFileWriter::Sync(bool use_fsync) {
+  Status s = Flush();
+  if (!s.ok()) {
+    return s;
+  }
+  TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds);
+  if (!use_direct_io() && pending_sync_) {
+    s = SyncInternal(use_fsync);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds);
+  pending_sync_ = false;
+  return Status::OK();
+}
+
+Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
+  if (!writable_file_->IsSyncThreadSafe()) {
+    return Status::NotSupported(
+        "Can't WritableFileWriter::SyncWithoutFlush() because "
+        "WritableFile::IsSyncThreadSafe() is false");
+  }
+  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
+  Status s = SyncInternal(use_fsync);
+  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
+  return s;
+}
+
+Status WritableFileWriter::SyncInternal(bool use_fsync) {
+  Status s;
+  IOSTATS_TIMER_GUARD(fsync_nanos);
+  TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
+  auto prev_perf_level = GetPerfLevel();
+  IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_);
+  if (use_fsync) {
+    s = writable_file_->Fsync(IOOptions(), nullptr);
+  } else {
+    s = writable_file_->Sync(IOOptions(), nullptr);
+  }
+  SetPerfLevel(prev_perf_level);
+  return s;
+}
+
+Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
+  IOSTATS_TIMER_GUARD(range_sync_nanos);
+  TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
+  return writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr);
+}
+
+// This method writes to disk the specified data and makes use of the rate
+// limiter if available
+Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
+  Status s;
+  assert(!use_direct_io());
+  const char* src = data;
+  size_t left = size;
+
+  while (left > 0) {
+    size_t allowed;
+    if (rate_limiter_ != nullptr) {
+      allowed = rate_limiter_->RequestToken(
+          left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_,
+          RateLimiter::OpType::kWrite);
+    } else {
+      allowed = left;
+    }
+
+    {
+      IOSTATS_TIMER_GUARD(write_nanos);
+      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+
+#ifndef ROCKSDB_LITE
+      FileOperationInfo::TimePoint start_ts;
+      uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr);
+      if (ShouldNotifyListeners()) {
+        start_ts = std::chrono::system_clock::now();
+        old_size = next_write_offset_;
+      }
+#endif
+      {
+        auto prev_perf_level = GetPerfLevel();
+        IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_);
+        s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr);
+        SetPerfLevel(prev_perf_level);
+      }
+#ifndef ROCKSDB_LITE
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = std::chrono::system_clock::now();
+        NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
+      }
+#endif
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    IOSTATS_ADD(bytes_written, allowed);
+    TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds);
+
+    left -= allowed;
+    src += allowed;
+  }
+  buf_.Size(0);
+  return s;
+}
+
+void WritableFileWriter::CalculateFileChecksum(const Slice& data) {
+  if (checksum_func_ != nullptr) {
+    if (is_first_checksum_) {
+      file_checksum_ = checksum_func_->Value(data.data(), data.size());
+      is_first_checksum_ = false;
+    } else {
+      file_checksum_ =
+          checksum_func_->Extend(file_checksum_, data.data(), data.size());
+    }
+  }
+}
+
+// This flushes the accumulated data in the buffer. We pad data with zeros if
+// necessary to the whole page.
+// However, during automatic flushes padding would not be necessary.
+// We always use RateLimiter if available. We move (Refit) any buffer bytes
+// that are left over the
+// whole number of pages to be written again on the next flush because we can
+// only write on aligned
+// offsets.
+#ifndef ROCKSDB_LITE
+Status WritableFileWriter::WriteDirect() {
+  assert(use_direct_io());
+  Status s;
+  const size_t alignment = buf_.Alignment();
+  assert((next_write_offset_ % alignment) == 0);
+
+  // Calculate whole page final file advance if all writes succeed
+  size_t file_advance = TruncateToPageBoundary(alignment, buf_.CurrentSize());
+
+  // Calculate the leftover tail, we write it here padded with zeros BUT we
+  // will write
+  // it again in the future either on Close() OR when the current whole page
+  // fills out
+  size_t leftover_tail = buf_.CurrentSize() - file_advance;
+
+  // Round up and pad
+  buf_.PadToAlignmentWith(0);
+
+  const char* src = buf_.BufferStart();
+  uint64_t write_offset = next_write_offset_;
+  size_t left = buf_.CurrentSize();
+
+  while (left > 0) {
+    // Check how much is allowed
+    size_t size;
+    if (rate_limiter_ != nullptr) {
+      size = rate_limiter_->RequestToken(left, buf_.Alignment(),
+                                         writable_file_->GetIOPriority(),
+                                         stats_, RateLimiter::OpType::kWrite);
+    } else {
+      size = left;
+    }
+
+    {
+      IOSTATS_TIMER_GUARD(write_nanos);
+      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+      FileOperationInfo::TimePoint start_ts;
+      if (ShouldNotifyListeners()) {
+        start_ts = std::chrono::system_clock::now();
+      }
+      // direct writes must be positional
+      s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
+                                           IOOptions(), nullptr);
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = std::chrono::system_clock::now();
+        NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
+      }
+      if (!s.ok()) {
+        buf_.Size(file_advance + leftover_tail);
+        return s;
+      }
+    }
+
+    IOSTATS_ADD(bytes_written, size);
+    left -= size;
+    src += size;
+    write_offset += size;
+    assert((next_write_offset_ % alignment) == 0);
+  }
+
+  if (s.ok()) {
+    // Move the tail to the beginning of the buffer
+    // This never happens during normal Append but rather during
+    // explicit call to Flush()/Sync() or Close()
+    buf_.RefitTail(file_advance, leftover_tail);
+    // This is where we start writing next time which may or not be
+    // the actual file size on disk. They match if the buffer size
+    // is a multiple of whole pages otherwise filesize_ is leftover_tail
+    // behind
+    next_write_offset_ += file_advance;
+  }
+  return s;
+}
+#endif  // !ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/writable_file_writer.h b/src/rocksdb/file/writable_file_writer.h
new file mode 100644
index 000000000..123110713
--- /dev/null
+++ b/src/rocksdb/file/writable_file_writer.h
@@ -0,0 +1,171 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+
+// WritableFileWriter is a wrapper on top of Env::WritableFile. It provides
+// facilities to:
+// - Handle Buffered and Direct writes.
+// - Rate limit writes.
+// - Flush and Sync the data to the underlying filesystem.
+// - Notify any interested listeners on the completion of a write.
+// - Update IO stats.
+class WritableFileWriter {
+ private:
+#ifndef ROCKSDB_LITE
+  void NotifyOnFileWriteFinish(uint64_t offset, size_t length,
+                               const FileOperationInfo::TimePoint& start_ts,
+                               const FileOperationInfo::TimePoint& finish_ts,
+                               const Status& status) {
+    FileOperationInfo info(file_name_, start_ts, finish_ts);
+    info.offset = offset;
+    info.length = length;
+    info.status = status;
+
+    for (auto& listener : listeners_) {
+      listener->OnFileWriteFinish(info);
+    }
+  }
+#endif  // ROCKSDB_LITE
+
+  bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+  void CalculateFileChecksum(const Slice& data);
+
+  std::unique_ptr<FSWritableFile> writable_file_;
+  std::string file_name_;
+  Env* env_;
+  AlignedBuffer buf_;
+  size_t max_buffer_size_;
+  // Actually written data size can be used for truncate
+  // not counting padding data
+  uint64_t filesize_;
+#ifndef ROCKSDB_LITE
+  // This is necessary when we use unbuffered access
+  // and writes must happen on aligned offsets
+  // so we need to go back and write that page again
+  uint64_t next_write_offset_;
+#endif  // ROCKSDB_LITE
+  bool pending_sync_;
+  uint64_t last_sync_size_;
+  uint64_t bytes_per_sync_;
+  RateLimiter* rate_limiter_;
+  Statistics* stats_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+  FileChecksumFunc* checksum_func_;
+  std::string file_checksum_ = kUnknownFileChecksum;
+  bool is_first_checksum_ = true;
+
+ public:
+  WritableFileWriter(
+      std::unique_ptr<FSWritableFile>&& file, const std::string& _file_name,
+      const FileOptions& options, Env* env = nullptr,
+      Statistics* stats = nullptr,
+      const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+      FileChecksumFunc* checksum_func = nullptr)
+      : writable_file_(std::move(file)),
+        file_name_(_file_name),
+        env_(env),
+        buf_(),
+        max_buffer_size_(options.writable_file_max_buffer_size),
+        filesize_(0),
+#ifndef ROCKSDB_LITE
+        next_write_offset_(0),
+#endif  // ROCKSDB_LITE
+        pending_sync_(false),
+        last_sync_size_(0),
+        bytes_per_sync_(options.bytes_per_sync),
+        rate_limiter_(options.rate_limiter),
+        stats_(stats),
+        listeners_(),
+        checksum_func_(checksum_func) {
+    TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0",
+                             reinterpret_cast<void*>(max_buffer_size_));
+    buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
+    buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_));
+#ifndef ROCKSDB_LITE
+    std::for_each(listeners.begin(), listeners.end(),
+                  [this](const std::shared_ptr<EventListener>& e) {
+                    if (e->ShouldBeNotifiedOnFileIO()) {
+                      listeners_.emplace_back(e);
+                    }
+                  });
+#else  // !ROCKSDB_LITE
+    (void)listeners;
+#endif
+  }
+
+  WritableFileWriter(const WritableFileWriter&) = delete;
+
+  WritableFileWriter& operator=(const WritableFileWriter&) = delete;
+
+  ~WritableFileWriter() { Close(); }
+
+  std::string file_name() const { return file_name_; }
+
+  Status Append(const Slice& data);
+
+  Status Pad(const size_t pad_bytes);
+
+  Status Flush();
+
+  Status Close();
+
+  Status Sync(bool use_fsync);
+
+  // Sync only the data that was already Flush()ed. Safe to call concurrently
+  // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(),
+  // returns NotSupported status.
+  Status SyncWithoutFlush(bool use_fsync);
+
+  uint64_t GetFileSize() const { return filesize_; }
+
+  Status InvalidateCache(size_t offset, size_t length) {
+    return writable_file_->InvalidateCache(offset, length);
+  }
+
+  FSWritableFile* writable_file() const { return writable_file_.get(); }
+
+  bool use_direct_io() { return writable_file_->use_direct_io(); }
+
+  bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; }
+
+  void TEST_SetFileChecksumFunc(FileChecksumFunc* checksum_func) {
+    checksum_func_ = checksum_func;
+  }
+
+  const std::string& GetFileChecksum() const { return file_checksum_; }
+
+  const char* GetFileChecksumFuncName() const;
+
+ private:
+  // Used when os buffering is OFF and we are writing
+  // DMA such as in Direct I/O mode
+#ifndef ROCKSDB_LITE
+  Status WriteDirect();
+#endif  // !ROCKSDB_LITE
+  // Normal write
+  Status WriteBuffered(const char* data, size_t size);
+  Status RangeSync(uint64_t offset, uint64_t nbytes);
+  Status SyncInternal(bool use_fsync);
+};
+}  // namespace ROCKSDB_NAMESPACE
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
commit	19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree	42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/rocksdb/file
parent	Initial commit. (diff)
download	ceph-upstream/16.2.11+ds.tar.xz ceph-upstream/16.2.11+ds.zip