summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/file
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/rocksdb/file
parentInitial commit. (diff)
downloadceph-upstream/16.2.11+ds.tar.xz
ceph-upstream/16.2.11+ds.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/file')
-rw-r--r--src/rocksdb/file/delete_scheduler.cc357
-rw-r--r--src/rocksdb/file/delete_scheduler.h141
-rw-r--r--src/rocksdb/file/delete_scheduler_test.cc693
-rw-r--r--src/rocksdb/file/file_prefetch_buffer.cc136
-rw-r--r--src/rocksdb/file/file_prefetch_buffer.h97
-rw-r--r--src/rocksdb/file/file_util.cc124
-rw-r--r--src/rocksdb/file/file_util.h33
-rw-r--r--src/rocksdb/file/filename.cc456
-rw-r--r--src/rocksdb/file/filename.h185
-rw-r--r--src/rocksdb/file/random_access_file_reader.cc189
-rw-r--r--src/rocksdb/file/random_access_file_reader.h120
-rw-r--r--src/rocksdb/file/read_write_util.cc67
-rw-r--r--src/rocksdb/file/read_write_util.h34
-rw-r--r--src/rocksdb/file/readahead_raf.cc162
-rw-r--r--src/rocksdb/file/readahead_raf.h27
-rw-r--r--src/rocksdb/file/sequence_file_reader.cc237
-rw-r--r--src/rocksdb/file/sequence_file_reader.h67
-rw-r--r--src/rocksdb/file/sst_file_manager_impl.cc558
-rw-r--r--src/rocksdb/file/sst_file_manager_impl.h197
-rw-r--r--src/rocksdb/file/writable_file_writer.cc429
-rw-r--r--src/rocksdb/file/writable_file_writer.h171
21 files changed, 4480 insertions, 0 deletions
diff --git a/src/rocksdb/file/delete_scheduler.cc b/src/rocksdb/file/delete_scheduler.cc
new file mode 100644
index 000000000..bb318e595
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler.cc
@@ -0,0 +1,357 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "file/delete_scheduler.h"
+
+#include <thread>
+#include <vector>
+
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DeleteScheduler::DeleteScheduler(Env* env, FileSystem* fs,
+ int64_t rate_bytes_per_sec, Logger* info_log,
+ SstFileManagerImpl* sst_file_manager,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk)
+ : env_(env),
+ fs_(fs),
+ total_trash_size_(0),
+ rate_bytes_per_sec_(rate_bytes_per_sec),
+ pending_files_(0),
+ bytes_max_delete_chunk_(bytes_max_delete_chunk),
+ closing_(false),
+ cv_(&mu_),
+ info_log_(info_log),
+ sst_file_manager_(sst_file_manager),
+ max_trash_db_ratio_(max_trash_db_ratio) {
+ assert(sst_file_manager != nullptr);
+ assert(max_trash_db_ratio >= 0);
+ bg_thread_.reset(
+ new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this));
+}
+
+DeleteScheduler::~DeleteScheduler() {
+ {
+ InstrumentedMutexLock l(&mu_);
+ closing_ = true;
+ cv_.SignalAll();
+ }
+ if (bg_thread_) {
+ bg_thread_->join();
+ }
+}
+
+Status DeleteScheduler::DeleteFile(const std::string& file_path,
+ const std::string& dir_to_sync,
+ const bool force_bg) {
+ Status s;
+ if (rate_bytes_per_sec_.load() <= 0 || (!force_bg &&
+ total_trash_size_.load() >
+ sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) {
+ // Rate limiting is disabled or trash size makes up more than
+ // max_trash_db_ratio_ (default 25%) of the total DB size
+ TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
+ s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+ if (s.ok()) {
+ sst_file_manager_->OnDeleteFile(file_path);
+ }
+ return s;
+ }
+
+ // Move file to trash
+ std::string trash_file;
+ s = MarkAsTrash(file_path, &trash_file);
+
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s",
+ file_path.c_str(), s.ToString().c_str());
+ s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+ if (s.ok()) {
+ sst_file_manager_->OnDeleteFile(file_path);
+ }
+ return s;
+ }
+
+ // Update the total trash size
+ uint64_t trash_file_size = 0;
+ fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
+ total_trash_size_.fetch_add(trash_file_size);
+
+ // Add file to delete queue
+ {
+ InstrumentedMutexLock l(&mu_);
+ queue_.emplace(trash_file, dir_to_sync);
+ pending_files_++;
+ if (pending_files_ == 1) {
+ cv_.SignalAll();
+ }
+ }
+ return s;
+}
+
+std::map<std::string, Status> DeleteScheduler::GetBackgroundErrors() {
+ InstrumentedMutexLock l(&mu_);
+ return bg_errors_;
+}
+
+const std::string DeleteScheduler::kTrashExtension = ".trash";
+bool DeleteScheduler::IsTrashFile(const std::string& file_path) {
+ return (file_path.size() >= kTrashExtension.size() &&
+ file_path.rfind(kTrashExtension) ==
+ file_path.size() - kTrashExtension.size());
+}
+
+Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+ const std::string& path) {
+ Status s;
+ // Check if there are any files marked as trash in this path
+ std::vector<std::string> files_in_path;
+ s = env->GetChildren(path, &files_in_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (const std::string& current_file : files_in_path) {
+ if (!DeleteScheduler::IsTrashFile(current_file)) {
+ // not a trash file, skip
+ continue;
+ }
+
+ Status file_delete;
+ std::string trash_file = path + "/" + current_file;
+ if (sfm) {
+ // We have an SstFileManager that will schedule the file delete
+ sfm->OnAddFile(trash_file);
+ file_delete = sfm->ScheduleFileDeletion(trash_file, path);
+ } else {
+ // Delete the file immediately
+ file_delete = env->DeleteFile(trash_file);
+ }
+
+ if (s.ok() && !file_delete.ok()) {
+ s = file_delete;
+ }
+ }
+
+ return s;
+}
+
+Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
+ std::string* trash_file) {
+ // Sanity check of the path
+ size_t idx = file_path.rfind("/");
+ if (idx == std::string::npos || idx == file_path.size() - 1) {
+ return Status::InvalidArgument("file_path is corrupted");
+ }
+
+ Status s;
+ if (DeleteScheduler::IsTrashFile(file_path)) {
+ // This is already a trash file
+ *trash_file = file_path;
+ return s;
+ }
+
+ *trash_file = file_path + kTrashExtension;
+ // TODO(tec) : Implement Env::RenameFileIfNotExist and remove
+ // file_move_mu mutex.
+ int cnt = 0;
+ InstrumentedMutexLock l(&file_move_mu_);
+ while (true) {
+ s = fs_->FileExists(*trash_file, IOOptions(), nullptr);
+ if (s.IsNotFound()) {
+ // We found a path for our file in trash
+ s = fs_->RenameFile(file_path, *trash_file, IOOptions(), nullptr);
+ break;
+ } else if (s.ok()) {
+ // Name conflict, generate new random suffix
+ *trash_file = file_path + std::to_string(cnt) + kTrashExtension;
+ } else {
+ // Error during FileExists call, we cannot continue
+ break;
+ }
+ cnt++;
+ }
+ if (s.ok()) {
+ sst_file_manager_->OnMoveFile(file_path, *trash_file);
+ }
+ return s;
+}
+
+void DeleteScheduler::BackgroundEmptyTrash() {
+ TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
+
+ while (true) {
+ InstrumentedMutexLock l(&mu_);
+ while (queue_.empty() && !closing_) {
+ cv_.Wait();
+ }
+
+ if (closing_) {
+ return;
+ }
+
+ // Delete all files in queue_
+ uint64_t start_time = env_->NowMicros();
+ uint64_t total_deleted_bytes = 0;
+ int64_t current_delete_rate = rate_bytes_per_sec_.load();
+ while (!queue_.empty() && !closing_) {
+ if (current_delete_rate != rate_bytes_per_sec_.load()) {
+ // User changed the delete rate
+ current_delete_rate = rate_bytes_per_sec_.load();
+ start_time = env_->NowMicros();
+ total_deleted_bytes = 0;
+ }
+
+ // Get new file to delete
+ const FileAndDir& fad = queue_.front();
+ std::string path_in_trash = fad.fname;
+
+ // We dont need to hold the lock while deleting the file
+ mu_.Unlock();
+ uint64_t deleted_bytes = 0;
+ bool is_complete = true;
+ // Delete file from trash and update total_penlty value
+ Status s =
+ DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
+ total_deleted_bytes += deleted_bytes;
+ mu_.Lock();
+ if (is_complete) {
+ queue_.pop();
+ }
+
+ if (!s.ok()) {
+ bg_errors_[path_in_trash] = s;
+ }
+
+ // Apply penlty if necessary
+ uint64_t total_penlty;
+ if (current_delete_rate > 0) {
+ // rate limiting is enabled
+ total_penlty =
+ ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate);
+ while (!closing_ && !cv_.TimedWait(start_time + total_penlty)) {}
+ } else {
+ // rate limiting is disabled
+ total_penlty = 0;
+ }
+ TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
+ &total_penlty);
+
+ if (is_complete) {
+ pending_files_--;
+ }
+ if (pending_files_ == 0) {
+ // Unblock WaitForEmptyTrash since there are no more files waiting
+ // to be deleted
+ cv_.SignalAll();
+ }
+ }
+ }
+}
+
+Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
+ const std::string& dir_to_sync,
+ uint64_t* deleted_bytes,
+ bool* is_complete) {
+ uint64_t file_size;
+ Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
+ *is_complete = true;
+ TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
+ if (s.ok()) {
+ bool need_full_delete = true;
+ if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
+ uint64_t num_hard_links = 2;
+ // We don't have to worry aobut data race between linking a new
+ // file after the number of file link check and ftruncte because
+ // the file is now in trash and no hardlink is supposed to create
+ // to trash files by RocksDB.
+ Status my_status = fs_->NumFileLinks(path_in_trash, IOOptions(),
+ &num_hard_links, nullptr);
+ if (my_status.ok()) {
+ if (num_hard_links == 1) {
+ std::unique_ptr<FSWritableFile> wf;
+ my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(),
+ &wf, nullptr);
+ if (my_status.ok()) {
+ my_status = wf->Truncate(file_size - bytes_max_delete_chunk_,
+ IOOptions(), nullptr);
+ if (my_status.ok()) {
+ TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
+ my_status = wf->Fsync(IOOptions(), nullptr);
+ }
+ }
+ if (my_status.ok()) {
+ *deleted_bytes = bytes_max_delete_chunk_;
+ need_full_delete = false;
+ *is_complete = false;
+ } else {
+ ROCKS_LOG_WARN(info_log_,
+ "Failed to partially delete %s from trash -- %s",
+ path_in_trash.c_str(), my_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_INFO(info_log_,
+ "Cannot delete %s slowly through ftruncate from trash "
+ "as it has other links",
+ path_in_trash.c_str());
+ }
+ } else if (!num_link_error_printed_) {
+ ROCKS_LOG_INFO(
+ info_log_,
+ "Cannot delete files slowly through ftruncate from trash "
+ "as Env::NumFileLinks() returns error: %s",
+ my_status.ToString().c_str());
+ num_link_error_printed_ = true;
+ }
+ }
+
+ if (need_full_delete) {
+ s = fs_->DeleteFile(path_in_trash, IOOptions(), nullptr);
+ if (!dir_to_sync.empty()) {
+ std::unique_ptr<FSDirectory> dir_obj;
+ if (s.ok()) {
+ s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr);
+ }
+ if (s.ok()) {
+ s = dir_obj->Fsync(IOOptions(), nullptr);
+ TEST_SYNC_POINT_CALLBACK(
+ "DeleteScheduler::DeleteTrashFile::AfterSyncDir",
+ reinterpret_cast<void*>(const_cast<std::string*>(&dir_to_sync)));
+ }
+ }
+ *deleted_bytes = file_size;
+ sst_file_manager_->OnDeleteFile(path_in_trash);
+ }
+ }
+ if (!s.ok()) {
+ // Error while getting file size or while deleting
+ ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s",
+ path_in_trash.c_str(), s.ToString().c_str());
+ *deleted_bytes = 0;
+ } else {
+ total_trash_size_.fetch_sub(*deleted_bytes);
+ }
+
+ return s;
+}
+
+void DeleteScheduler::WaitForEmptyTrash() {
+ InstrumentedMutexLock l(&mu_);
+ while (pending_files_ > 0 && !closing_) {
+ cv_.Wait();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/file/delete_scheduler.h b/src/rocksdb/file/delete_scheduler.h
new file mode 100644
index 000000000..60948480a
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler.h
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <queue>
+#include <string>
+#include <thread>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Logger;
+class SstFileManagerImpl;
+
+// DeleteScheduler allows the DB to enforce a rate limit on file deletion,
+// Instead of deleteing files immediately, files are marked as trash
+// and deleted in a background thread that apply sleep penlty between deletes
+// if they are happening in a rate faster than rate_bytes_per_sec,
+//
+// Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this
+// case DeleteScheduler will delete files immediately.
+class DeleteScheduler {
+ public:
+ DeleteScheduler(Env* env, FileSystem* fs, int64_t rate_bytes_per_sec,
+ Logger* info_log, SstFileManagerImpl* sst_file_manager,
+ double max_trash_db_ratio, uint64_t bytes_max_delete_chunk);
+
+ ~DeleteScheduler();
+
+ // Return delete rate limit in bytes per second
+ int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_.load(); }
+
+ // Set delete rate limit in bytes per second
+ void SetRateBytesPerSecond(int64_t bytes_per_sec) {
+ rate_bytes_per_sec_.store(bytes_per_sec);
+ }
+
+ // Mark file as trash directory and schedule it's deletion. If force_bg is
+ // set, it forces the file to always be deleted in the background thread,
+ // except when rate limiting is disabled
+ Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
+ const bool force_bg = false);
+
+ // Wait for all files being deleteing in the background to finish or for
+ // destructor to be called.
+ void WaitForEmptyTrash();
+
+ // Return a map containing errors that happened in BackgroundEmptyTrash
+ // file_path => error status
+ std::map<std::string, Status> GetBackgroundErrors();
+
+ uint64_t GetTotalTrashSize() { return total_trash_size_.load(); }
+
+ // Return trash/DB size ratio where new files will be deleted immediately
+ double GetMaxTrashDBRatio() {
+ return max_trash_db_ratio_.load();
+ }
+
+ // Update trash/DB size ratio where new files will be deleted immediately
+ void SetMaxTrashDBRatio(double r) {
+ assert(r >= 0);
+ max_trash_db_ratio_.store(r);
+ }
+
+ static const std::string kTrashExtension;
+ static bool IsTrashFile(const std::string& file_path);
+
+ // Check if there are any .trash filse in path, and schedule their deletion
+ // Or delete immediately if sst_file_manager is nullptr
+ static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+ const std::string& path);
+
+ private:
+ Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
+
+ Status DeleteTrashFile(const std::string& path_in_trash,
+ const std::string& dir_to_sync,
+ uint64_t* deleted_bytes, bool* is_complete);
+
+ void BackgroundEmptyTrash();
+
+ Env* env_;
+ FileSystem* fs_;
+
+ // total size of trash files
+ std::atomic<uint64_t> total_trash_size_;
+ // Maximum number of bytes that should be deleted per second
+ std::atomic<int64_t> rate_bytes_per_sec_;
+ // Mutex to protect queue_, pending_files_, bg_errors_, closing_
+ InstrumentedMutex mu_;
+
+ struct FileAndDir {
+ FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
+ std::string fname;
+ std::string dir; // empty will be skipped.
+ };
+
+ // Queue of trash files that need to be deleted
+ std::queue<FileAndDir> queue_;
+ // Number of trash files that are waiting to be deleted
+ int32_t pending_files_;
+ uint64_t bytes_max_delete_chunk_;
+ // Errors that happened in BackgroundEmptyTrash (file_path => error)
+ std::map<std::string, Status> bg_errors_;
+
+ bool num_link_error_printed_ = false;
+ // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
+ bool closing_;
+ // Condition variable signaled in these conditions
+ // - pending_files_ value change from 0 => 1
+ // - pending_files_ value change from 1 => 0
+ // - closing_ value is set to true
+ InstrumentedCondVar cv_;
+ // Background thread running BackgroundEmptyTrash
+ std::unique_ptr<port::Thread> bg_thread_;
+ // Mutex to protect threads from file name conflicts
+ InstrumentedMutex file_move_mu_;
+ Logger* info_log_;
+ SstFileManagerImpl* sst_file_manager_;
+ // If the trash size constitutes for more than this fraction of the total DB
+ // size we will start deleting new files passed to DeleteScheduler
+ // immediately
+ std::atomic<double> max_trash_db_ratio_;
+ static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/file/delete_scheduler_test.cc b/src/rocksdb/file/delete_scheduler_test.cc
new file mode 100644
index 000000000..cff645de5
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler_test.cc
@@ -0,0 +1,693 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <cinttypes>
+#include <thread>
+#include <vector>
+
+#include "file/delete_scheduler.h"
+#include "file/sst_file_manager_impl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteSchedulerTest : public testing::Test {
+ public:
+ DeleteSchedulerTest() : env_(Env::Default()) {
+ const int kNumDataDirs = 3;
+ dummy_files_dirs_.reserve(kNumDataDirs);
+ for (size_t i = 0; i < kNumDataDirs; ++i) {
+ dummy_files_dirs_.emplace_back(
+ test::PerThreadDBPath(env_, "delete_scheduler_dummy_data_dir") +
+ ToString(i));
+ DestroyAndCreateDir(dummy_files_dirs_.back());
+ }
+ }
+
+ ~DeleteSchedulerTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ for (const auto& dummy_files_dir : dummy_files_dirs_) {
+ test::DestroyDir(env_, dummy_files_dir);
+ }
+ }
+
+ void DestroyAndCreateDir(const std::string& dir) {
+ ASSERT_OK(test::DestroyDir(env_, dir));
+ EXPECT_OK(env_->CreateDir(dir));
+ }
+
+ int CountNormalFiles(size_t dummy_files_dirs_idx = 0) {
+ std::vector<std::string> files_in_dir;
+ EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+ &files_in_dir));
+
+ int normal_cnt = 0;
+ for (auto& f : files_in_dir) {
+ if (!DeleteScheduler::IsTrashFile(f) && f != "." && f != "..") {
+ normal_cnt++;
+ }
+ }
+ return normal_cnt;
+ }
+
+ int CountTrashFiles(size_t dummy_files_dirs_idx = 0) {
+ std::vector<std::string> files_in_dir;
+ EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+ &files_in_dir));
+
+ int trash_cnt = 0;
+ for (auto& f : files_in_dir) {
+ if (DeleteScheduler::IsTrashFile(f)) {
+ trash_cnt++;
+ }
+ }
+ return trash_cnt;
+ }
+
+ std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
+ size_t dummy_files_dirs_idx = 0) {
+ std::string file_path =
+ dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
+ std::unique_ptr<WritableFile> f;
+ env_->NewWritableFile(file_path, &f, EnvOptions());
+ std::string data(size, 'A');
+ EXPECT_OK(f->Append(data));
+ EXPECT_OK(f->Close());
+ sst_file_mgr_->OnAddFile(file_path, false);
+ return file_path;
+ }
+
+ void NewDeleteScheduler() {
+ // Tests in this file are for DeleteScheduler component and dont create any
+ // DBs, so we need to set max_trash_db_ratio to 100% (instead of default
+ // 25%)
+ std::shared_ptr<FileSystem>
+ fs(std::make_shared<LegacyFileSystemWrapper>(env_));
+ sst_file_mgr_.reset(
+ new SstFileManagerImpl(env_, fs, nullptr, rate_bytes_per_sec_,
+ /* max_trash_db_ratio= */ 1.1, 128 * 1024));
+ delete_scheduler_ = sst_file_mgr_->delete_scheduler();
+ }
+
+ Env* env_;
+ std::vector<std::string> dummy_files_dirs_;
+ int64_t rate_bytes_per_sec_;
+ DeleteScheduler* delete_scheduler_;
+ std::unique_ptr<SstFileManagerImpl> sst_file_mgr_;
+};
+
+// Test the basic functionality of DeleteScheduler (Rate Limiting).
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in trash
+// 4- Verify that BackgroundEmptyTrash used to correct penlties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::BasicRateLimiting:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+ int dir_synced = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile::AfterSyncDir", [&](void* arg) {
+ dir_synced++;
+ std::string* dir = reinterpret_cast<std::string*>(arg);
+ EXPECT_EQ(dummy_files_dirs_[0], *dir);
+ });
+
+ int num_files = 100; // 100 files
+ uint64_t file_size = 1024; // every file is 1 kb
+ std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ NewDeleteScheduler();
+
+ dir_synced = 0;
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files and measure time spent to empty trash
+ for (int i = 0; i < num_files; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i],
+ dummy_files_dirs_[0]));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::BasicRateLimiting:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penlty = 0;
+ ASSERT_EQ(penalties.size(), num_files);
+ for (int i = 0; i < num_files; i++) {
+ total_files_size += file_size;
+ expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penlty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+
+ ASSERT_EQ(num_files, dir_synced);
+
+ ASSERT_EQ(CountTrashFiles(), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DeleteSchedulerTest, MultiDirectoryDeletionsScheduled) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ rate_bytes_per_sec_ = 1 << 20; // 1MB
+ NewDeleteScheduler();
+
+ // Generate dummy files in multiple directories
+ const size_t kNumFiles = dummy_files_dirs_.size();
+ const size_t kFileSize = 1 << 10; // 1KB
+ std::vector<std::string> generated_files;
+ for (size_t i = 0; i < kNumFiles; i++) {
+ generated_files.push_back(NewDummyFile("file", kFileSize, i));
+ ASSERT_EQ(1, CountNormalFiles(i));
+ }
+
+ // Mark dummy files as trash
+ for (size_t i = 0; i < kNumFiles; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+ ASSERT_EQ(0, CountNormalFiles(i));
+ ASSERT_EQ(1, CountTrashFiles(i));
+ }
+ TEST_SYNC_POINT("DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1");
+ delete_scheduler_->WaitForEmptyTrash();
+
+ // Verify dummy files eventually got deleted
+ for (size_t i = 0; i < kNumFiles; i++) {
+ ASSERT_EQ(0, CountNormalFiles(i));
+ ASSERT_EQ(0, CountTrashFiles(i));
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Same as the BasicRateLimiting test but delete files in multiple threads.
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler using 10 threads
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Verify that BackgroundEmptyTrash used to correct penlties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::RateLimitingMultiThreaded:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ int thread_cnt = 10;
+ int num_files = 10; // 10 files per thread
+ uint64_t file_size = 1024; // every file is 1 kb
+
+ std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ NewDeleteScheduler();
+
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files * thread_cnt; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files using 10 threads and measure time spent to empty trash
+ std::atomic<int> thread_num(0);
+ std::vector<port::Thread> threads;
+ std::function<void()> delete_thread = [&]() {
+ int idx = thread_num.fetch_add(1);
+ int range_start = idx * num_files;
+ int range_end = range_start + num_files;
+ for (int j = range_start; j < range_end; j++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[j], ""));
+ }
+ };
+
+ for (int i = 0; i < thread_cnt; i++) {
+ threads.emplace_back(delete_thread);
+ }
+
+ for (size_t i = 0; i < threads.size(); i++) {
+ threads[i].join();
+ }
+
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::RateLimitingMultiThreaded:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penlty = 0;
+ ASSERT_EQ(penalties.size(), num_files * thread_cnt);
+ for (int i = 0; i < num_files * thread_cnt; i++) {
+ total_files_size += file_size;
+ expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penlty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+// Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure
+// that when DeleteScheduler delete a file it delete it immediately and dont
+// move it to trash
+TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 0;
+ NewDeleteScheduler();
+
+ for (int i = 0; i < 10; i++) {
+ // Every file we delete will be deleted immediately
+ std::string dummy_file = NewDummyFile("dummy.data");
+ ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+ ASSERT_TRUE(env_->FileExists(dummy_file).IsNotFound());
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 0);
+ }
+
+ ASSERT_EQ(bg_delete_file, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Testing that moving files to trash with the same name is not a problem
+// 1- Create 10 files with the same name "conflict.data"
+// 2- Delete the 10 files using DeleteScheduler
+// 3- Make sure that trash directory contain 10 files ("conflict.data" x 10)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that files are deleted from trash
+TEST_F(DeleteSchedulerTest, ConflictNames) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::ConflictNames:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
+ NewDeleteScheduler();
+
+ // Create "conflict.data" and move it to trash 10 times
+ for (int i = 0; i < 10; i++) {
+ std::string dummy_file = NewDummyFile("conflict.data");
+ ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ // 10 files ("conflict.data" x 10) in trash
+ ASSERT_EQ(CountTrashFiles(), 10);
+
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DeleteSchedulerTest::ConflictNames:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ ASSERT_EQ(CountTrashFiles(), 0);
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete the 10 files using DeleteScheduler (move them to trsah)
+// 3- Delete the 10 files directly (using env_->DeleteFile)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that DeleteScheduler failed to delete the 10 files and
+// reported 10 background errors
+TEST_F(DeleteSchedulerTest, BackgroundError) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::BackgroundError:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
+ NewDeleteScheduler();
+
+ // Generate 10 dummy files and move them to trash
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 10);
+
+ // Delete 10 files from trash, this will cause background errors in
+ // BackgroundEmptyTrash since we already deleted the files it was
+ // goind to delete
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data.trash";
+ ASSERT_OK(env_->DeleteFile(dummy_files_dirs_[0] + "/" + file_name));
+ }
+
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DeleteSchedulerTest::BackgroundError:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 10);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete 10 dummy files using DeleteScheduler
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Make sure all files in trash directory were deleted
+// 5- Repeat previous steps 5 times
+TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ // Move files to trash, wait for empty trash, start again
+ for (int run = 1; run <= 5; run++) {
+ // Generate 10 dummy files and move them to trash
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ delete_scheduler_->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, 10 * run);
+ ASSERT_EQ(CountTrashFiles(), 0);
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ }
+
+ ASSERT_EQ(bg_delete_file, 50);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DeletePartialFile) {
+ int bg_delete_file = 0;
+ int bg_fsync = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void*) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ // Should delete in 4 batch
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_1", 500 * 1024), ""));
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_2", 100 * 1024), ""));
+ // Should delete in 2 batch
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_2", 200 * 1024), ""));
+
+ delete_scheduler_->WaitForEmptyTrash();
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(7, bg_delete_file);
+ ASSERT_EQ(4, bg_fsync);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+
+#ifdef OS_LINUX
+TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
+ int bg_delete_file = 0;
+ int bg_fsync = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void*) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ std::string file1 = NewDummyFile("data_1", 500 * 1024);
+ std::string file2 = NewDummyFile("data_2", 100 * 1024);
+
+ ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+ ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+ // Should delete in 4 batch if there is no hardlink
+ ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
+ ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
+
+ delete_scheduler_->WaitForEmptyTrash();
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(2, bg_delete_file);
+ ASSERT_EQ(0, bg_fsync);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+#endif
+
+// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
+// 2- Delete 100 files using DeleteScheduler
+// 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
+// 4- Make sure that not all files were deleted from trash and that
+// DeleteScheduler background thread did not delete all files
+TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1; // 1 Byte / sec
+ NewDeleteScheduler();
+
+ for (int i = 0; i < 100; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+
+ // Deleting 100 files will need >28 hours to delete
+ // we will delete the DeleteScheduler while delete queue is not empty
+ sst_file_mgr_.reset();
+
+ ASSERT_LT(bg_delete_file, 100);
+ ASSERT_GT(CountTrashFiles(), 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DISABLED_DynamicRateLimiting1) {
+ std::vector<uint64_t> penalties;
+ int bg_delete_file = 0;
+ int fg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::DynamicRateLimiting1:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 0; // Disable rate limiting initially
+ NewDeleteScheduler();
+
+
+ int num_files = 10; // 10 files
+ uint64_t file_size = 1024; // every file is 1 kb
+
+ std::vector<int64_t> delete_kbs_per_sec = {512, 200, 0, 100, 50, -2, 25};
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ bg_delete_file = 0;
+ fg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ delete_scheduler_->SetRateBytesPerSecond(rate_bytes_per_sec_);
+
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files and measure time spent to empty trash
+ for (int i = 0; i < num_files; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+
+ if (rate_bytes_per_sec_ > 0) {
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::DynamicRateLimiting1:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penlty = 0;
+ ASSERT_EQ(penalties.size(), num_files);
+ for (int i = 0; i < num_files; i++) {
+ total_files_size += file_size;
+ expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penlty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+ ASSERT_EQ(bg_delete_file, num_files);
+ ASSERT_EQ(fg_delete_file, 0);
+ } else {
+ ASSERT_EQ(penalties.size(), 0);
+ ASSERT_EQ(bg_delete_file, 0);
+ ASSERT_EQ(fg_delete_file, num_files);
+ }
+
+ ASSERT_EQ(CountTrashFiles(), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) {
+ int bg_delete_file = 0;
+ int fg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_files = 100; // 100 files
+ uint64_t file_size = 1024 * 10; // 100 KB as a file size
+ rate_bytes_per_sec_ = 1; // 1 byte per sec (very slow trash delete)
+
+ NewDeleteScheduler();
+ delete_scheduler_->SetMaxTrashDBRatio(0.25);
+
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ for (std::string& file_name : generated_files) {
+ delete_scheduler_->DeleteFile(file_name, "");
+ }
+
+ // When we end up with 26 files in trash we will start
+ // deleting new files immediately
+ ASSERT_EQ(fg_delete_file, 74);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, IsTrashCheck) {
+ // Trash files
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("x.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile(".trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.sst.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("/a/b/c/abc..sst.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("log.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("^^^^^.log.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.t.trash"));
+
+ // Not trash files
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.sst"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.txt"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sst"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sstrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("^^^^^.trashh"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.ttrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile(".ttrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+int main(int /*argc*/, char** /*argv*/) {
+ printf("DeleteScheduler is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/file/file_prefetch_buffer.cc b/src/rocksdb/file/file_prefetch_buffer.cc
new file mode 100644
index 000000000..7b55bd397
--- /dev/null
+++ b/src/rocksdb/file/file_prefetch_buffer.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/file_prefetch_buffer.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
+ uint64_t offset, size_t n,
+ bool for_compaction) {
+ if (!enable_ || reader == nullptr) {
+ return Status::OK();
+ }
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
+ size_t offset_ = static_cast<size_t>(offset);
+ uint64_t rounddown_offset = Rounddown(offset_, alignment);
+ uint64_t roundup_end = Roundup(offset_ + n, alignment);
+ uint64_t roundup_len = roundup_end - rounddown_offset;
+ assert(roundup_len >= alignment);
+ assert(roundup_len % alignment == 0);
+
+ // Check if requested bytes are in the existing buffer_.
+ // If all bytes exist -- return.
+ // If only a few bytes exist -- reuse them & read only what is really needed.
+ // This is typically the case of incremental reading of data.
+ // If no bytes exist in buffer -- full pread.
+
+ Status s;
+ uint64_t chunk_offset_in_buffer = 0;
+ uint64_t chunk_len = 0;
+ bool copy_data_to_new_buffer = false;
+ if (buffer_.CurrentSize() > 0 && offset >= buffer_offset_ &&
+ offset <= buffer_offset_ + buffer_.CurrentSize()) {
+ if (offset + n <= buffer_offset_ + buffer_.CurrentSize()) {
+ // All requested bytes are already in the buffer. So no need to Read
+ // again.
+ return s;
+ } else {
+ // Only a few requested bytes are in the buffer. memmove those chunk of
+ // bytes to the beginning, and memcpy them back into the new buffer if a
+ // new buffer is created.
+ chunk_offset_in_buffer =
+ Rounddown(static_cast<size_t>(offset - buffer_offset_), alignment);
+ chunk_len = buffer_.CurrentSize() - chunk_offset_in_buffer;
+ assert(chunk_offset_in_buffer % alignment == 0);
+ assert(chunk_len % alignment == 0);
+ assert(chunk_offset_in_buffer + chunk_len <=
+ buffer_offset_ + buffer_.CurrentSize());
+ if (chunk_len > 0) {
+ copy_data_to_new_buffer = true;
+ } else {
+ // this reset is not necessary, but just to be safe.
+ chunk_offset_in_buffer = 0;
+ }
+ }
+ }
+
+ // Create a new buffer only if current capacity is not sufficient, and memcopy
+ // bytes from old buffer if needed (i.e., if chunk_len is greater than 0).
+ if (buffer_.Capacity() < roundup_len) {
+ buffer_.Alignment(alignment);
+ buffer_.AllocateNewBuffer(static_cast<size_t>(roundup_len),
+ copy_data_to_new_buffer, chunk_offset_in_buffer,
+ static_cast<size_t>(chunk_len));
+ } else if (chunk_len > 0) {
+ // New buffer not needed. But memmove bytes from tail to the beginning since
+ // chunk_len is greater than 0.
+ buffer_.RefitTail(static_cast<size_t>(chunk_offset_in_buffer),
+ static_cast<size_t>(chunk_len));
+ }
+
+ Slice result;
+ s = reader->Read(rounddown_offset + chunk_len,
+ static_cast<size_t>(roundup_len - chunk_len), &result,
+ buffer_.BufferStart() + chunk_len, for_compaction);
+ if (s.ok()) {
+ buffer_offset_ = rounddown_offset;
+ buffer_.Size(static_cast<size_t>(chunk_len) + result.size());
+ }
+ return s;
+}
+
+bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
+ Slice* result, bool for_compaction) {
+ if (track_min_offset_ && offset < min_offset_read_) {
+ min_offset_read_ = static_cast<size_t>(offset);
+ }
+ if (!enable_ || offset < buffer_offset_) {
+ return false;
+ }
+
+ // If the buffer contains only a few of the requested bytes:
+ // If readahead is enabled: prefetch the remaining bytes + readadhead bytes
+ // and satisfy the request.
+ // If readahead is not enabled: return false.
+ if (offset + n > buffer_offset_ + buffer_.CurrentSize()) {
+ if (readahead_size_ > 0) {
+ assert(file_reader_ != nullptr);
+ assert(max_readahead_size_ >= readahead_size_);
+ Status s;
+ if (for_compaction) {
+ s = Prefetch(file_reader_, offset, std::max(n, readahead_size_),
+ for_compaction);
+ } else {
+ s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction);
+ }
+ if (!s.ok()) {
+ return false;
+ }
+ readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+ } else {
+ return false;
+ }
+ }
+
+ uint64_t offset_in_buffer = offset - buffer_offset_;
+ *result = Slice(buffer_.BufferStart() + offset_in_buffer, n);
+ return true;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_prefetch_buffer.h b/src/rocksdb/file/file_prefetch_buffer.h
new file mode 100644
index 000000000..d53f627b5
--- /dev/null
+++ b/src/rocksdb/file/file_prefetch_buffer.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <sstream>
+#include <string>
+#include "file/random_access_file_reader.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FilePrefetchBuffer is a smart buffer to store and read data from a file.
+class FilePrefetchBuffer {
+ public:
+ // Constructor.
+ //
+ // All arguments are optional.
+ // file_reader : the file reader to use. Can be a nullptr.
+ // readahead_size : the initial readahead size.
+ // max_readahead_size : the maximum readahead size.
+ // If max_readahead_size > readahead_size, the readahead size will be
+ // doubled on every IO until max_readahead_size is hit.
+ // Typically this is set as a multiple of readahead_size.
+ // max_readahead_size should be greater than equal to readahead_size.
+ // enable : controls whether reading from the buffer is enabled.
+ // If false, TryReadFromCache() always return false, and we only take stats
+ // for the minimum offset if track_min_offset = true.
+ // track_min_offset : Track the minimum offset ever read and collect stats on
+ // it. Used for adaptable readahead of the file footer/metadata.
+ //
+ // Automatic readhead is enabled for a file if file_reader, readahead_size,
+ // and max_readahead_size are passed in.
+ // If file_reader is a nullptr, setting readadhead_size and max_readahead_size
+ // does not make any sense. So it does nothing.
+ // A user can construct a FilePrefetchBuffer without any arguments, but use
+ // `Prefetch` to load data into the buffer.
+ FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr,
+ size_t readadhead_size = 0, size_t max_readahead_size = 0,
+ bool enable = true, bool track_min_offset = false)
+ : buffer_offset_(0),
+ file_reader_(file_reader),
+ readahead_size_(readadhead_size),
+ max_readahead_size_(max_readahead_size),
+ min_offset_read_(port::kMaxSizet),
+ enable_(enable),
+ track_min_offset_(track_min_offset) {}
+
+ // Load data into the buffer from a file.
+ // reader : the file reader.
+ // offset : the file offset to start reading from.
+ // n : the number of bytes to read.
+ // for_compaction : if prefetch is done for compaction read.
+ Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n,
+ bool for_compaction = false);
+
+ // Tries returning the data for a file raed from this buffer, if that data is
+ // in the buffer.
+ // It handles tracking the minimum read offset if track_min_offset = true.
+ // It also does the exponential readahead when readadhead_size is set as part
+ // of the constructor.
+ //
+ // offset : the file offset.
+ // n : the number of bytes.
+ // result : output buffer to put the data into.
+ // for_compaction : if cache read is done for compaction read.
+ bool TryReadFromCache(uint64_t offset, size_t n, Slice* result,
+ bool for_compaction = false);
+
+ // The minimum `offset` ever passed to TryReadFromCache(). This will nly be
+ // tracked if track_min_offset = true.
+ size_t min_offset_read() const { return min_offset_read_; }
+
+ private:
+ AlignedBuffer buffer_;
+ uint64_t buffer_offset_;
+ RandomAccessFileReader* file_reader_;
+ size_t readahead_size_;
+ size_t max_readahead_size_;
+ // The minimum `offset` ever passed to TryReadFromCache().
+ size_t min_offset_read_;
+ // if false, TryReadFromCache() always return false, and we only take stats
+ // for track_min_offset_ if track_min_offset_ = true
+ bool enable_;
+ // If true, track minimum `offset` ever passed to TryReadFromCache(), which
+ // can be fetched from min_offset_read().
+ bool track_min_offset_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_util.cc b/src/rocksdb/file/file_util.cc
new file mode 100644
index 000000000..c231bf7d1
--- /dev/null
+++ b/src/rocksdb/file/file_util.cc
@@ -0,0 +1,124 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "file/file_util.h"
+
+#include <string>
+#include <algorithm>
+
+#include "file/random_access_file_reader.h"
+#include "file/sequence_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Utility function to copy a file up to a specified length
+Status CopyFile(FileSystem* fs, const std::string& source,
+ const std::string& destination, uint64_t size, bool use_fsync) {
+ const FileOptions soptions;
+ Status s;
+ std::unique_ptr<SequentialFileReader> src_reader;
+ std::unique_ptr<WritableFileWriter> dest_writer;
+
+ {
+ std::unique_ptr<FSSequentialFile> srcfile;
+ s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ std::unique_ptr<FSWritableFile> destfile;
+ s = fs->NewWritableFile(destination, soptions, &destfile, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (size == 0) {
+ // default argument means copy everything
+ s = fs->GetFileSize(source, IOOptions(), &size, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ src_reader.reset(new SequentialFileReader(std::move(srcfile), source));
+ dest_writer.reset(
+ new WritableFileWriter(std::move(destfile), destination, soptions));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size));
+ s = src_reader->Read(bytes_to_read, &slice, buffer);
+ if (!s.ok()) {
+ return s;
+ }
+ if (slice.size() == 0) {
+ return Status::Corruption("file too small");
+ }
+ s = dest_writer->Append(slice);
+ if (!s.ok()) {
+ return s;
+ }
+ size -= slice.size();
+ }
+ return dest_writer->Sync(use_fsync);
+}
+
+// Utility function to create a file with the provided contents
+Status CreateFile(FileSystem* fs, const std::string& destination,
+ const std::string& contents, bool use_fsync) {
+ const EnvOptions soptions;
+ Status s;
+ std::unique_ptr<WritableFileWriter> dest_writer;
+
+ std::unique_ptr<FSWritableFile> destfile;
+ s = fs->NewWritableFile(destination, soptions, &destfile, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ dest_writer.reset(
+ new WritableFileWriter(std::move(destfile), destination, soptions));
+ s = dest_writer->Append(Slice(contents));
+ if (!s.ok()) {
+ return s;
+ }
+ return dest_writer->Sync(use_fsync);
+}
+
+Status DeleteDBFile(const ImmutableDBOptions* db_options,
+ const std::string& fname, const std::string& dir_to_sync,
+ const bool force_bg, const bool force_fg) {
+#ifndef ROCKSDB_LITE
+ SstFileManagerImpl* sfm =
+ static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
+ if (sfm && !force_fg) {
+ return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
+ } else {
+ return db_options->env->DeleteFile(fname);
+ }
+#else
+ (void)dir_to_sync;
+ (void)force_bg;
+ (void)force_fg;
+ // SstFileManager is not supported in ROCKSDB_LITE
+ // Delete file immediately
+ return db_options->env->DeleteFile(fname);
+#endif
+}
+
+bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) {
+ bool same = false;
+ assert(!db_options->db_paths.empty());
+ Status s = db_options->env->AreFilesSame(db_options->wal_dir,
+ db_options->db_paths[0].path, &same);
+ if (s.IsNotSupported()) {
+ same = db_options->wal_dir == db_options->db_paths[0].path;
+ }
+ return same;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_util.h b/src/rocksdb/file/file_util.h
new file mode 100644
index 000000000..333749e4d
--- /dev/null
+++ b/src/rocksdb/file/file_util.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <string>
+
+#include "file/filename.h"
+#include "options/db_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+// use_fsync maps to options.use_fsync, which determines the way that
+// the file is synced after copying.
+extern Status CopyFile(FileSystem* fs, const std::string& source,
+ const std::string& destination, uint64_t size,
+ bool use_fsync);
+
+extern Status CreateFile(FileSystem* fs, const std::string& destination,
+ const std::string& contents, bool use_fsync);
+
+extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
+ const std::string& fname,
+ const std::string& path_to_sync, const bool force_bg,
+ const bool force_fg);
+
+extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/filename.cc b/src/rocksdb/file/filename.cc
new file mode 100644
index 000000000..d620b7351
--- /dev/null
+++ b/src/rocksdb/file/filename.cc
@@ -0,0 +1,456 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "file/filename.h"
+#include <cinttypes>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <vector>
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const std::string kRocksDbTFileExt = "sst";
+static const std::string kLevelDbTFileExt = "ldb";
+static const std::string kRocksDBBlobFileExt = "blob";
+
+// Given a path, flatten the path name by replacing all chars not in
+// {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end.
+// Return the number of chars stored in dest not including the trailing '\0'.
+static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) {
+ const char suffix[] = "_LOG";
+
+ size_t write_idx = 0;
+ size_t i = 0;
+ size_t src_len = path.size();
+
+ while (i < src_len && write_idx < len - sizeof(suffix)) {
+ if ((path[i] >= 'a' && path[i] <= 'z') ||
+ (path[i] >= '0' && path[i] <= '9') ||
+ (path[i] >= 'A' && path[i] <= 'Z') ||
+ path[i] == '-' ||
+ path[i] == '.' ||
+ path[i] == '_'){
+ dest[write_idx++] = path[i];
+ } else {
+ if (i > 0) {
+ dest[write_idx++] = '_';
+ }
+ }
+ i++;
+ }
+ assert(sizeof(suffix) <= len - write_idx);
+ // "\0" is automatically added by snprintf
+ snprintf(dest + write_idx, len - write_idx, suffix);
+ write_idx += sizeof(suffix) - 1;
+ return write_idx;
+}
+
+static std::string MakeFileName(uint64_t number, const char* suffix) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%06llu.%s",
+ static_cast<unsigned long long>(number), suffix);
+ return buf;
+}
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+ const char* suffix) {
+ return name + "/" + MakeFileName(number, suffix);
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(name, number, "log");
+}
+
+std::string LogFileName(uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(number, "log");
+}
+
+std::string BlobFileName(const std::string& blobdirname, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str());
+}
+
+std::string BlobFileName(const std::string& dbname, const std::string& blob_dir,
+ uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(dbname + "/" + blob_dir, number,
+ kRocksDBBlobFileExt.c_str());
+}
+
+std::string ArchivalDirectory(const std::string& dir) {
+ return dir + "/" + ARCHIVAL_DIR;
+}
+std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
+}
+
+std::string MakeTableFileName(const std::string& path, uint64_t number) {
+ return MakeFileName(path, number, kRocksDbTFileExt.c_str());
+}
+
+std::string MakeTableFileName(uint64_t number) {
+ return MakeFileName(number, kRocksDbTFileExt.c_str());
+}
+
+std::string Rocks2LevelTableFileName(const std::string& fullname) {
+ assert(fullname.size() > kRocksDbTFileExt.size() + 1);
+ if (fullname.size() <= kRocksDbTFileExt.size() + 1) {
+ return "";
+ }
+ return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) +
+ kLevelDbTFileExt;
+}
+
+uint64_t TableFileNameToNumber(const std::string& name) {
+ uint64_t number = 0;
+ uint64_t base = 1;
+ int pos = static_cast<int>(name.find_last_of('.'));
+ while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') {
+ number += (name[pos] - '0') * base;
+ base *= 10;
+ }
+ return number;
+}
+
+std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
+ uint32_t path_id) {
+ assert(number > 0);
+ std::string path;
+ if (path_id >= db_paths.size()) {
+ path = db_paths.back().path;
+ } else {
+ path = db_paths[path_id].path;
+ }
+ return MakeTableFileName(path, number);
+}
+
+void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+ size_t out_buf_size) {
+ if (path_id == 0) {
+ snprintf(out_buf, out_buf_size, "%" PRIu64, number);
+ } else {
+ snprintf(out_buf, out_buf_size, "%" PRIu64
+ "(path "
+ "%" PRIu32 ")",
+ number, path_id);
+ }
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+ assert(number > 0);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+ static_cast<unsigned long long>(number));
+ return dbname + buf;
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+ return dbname + "/CURRENT";
+}
+
+std::string LockFileName(const std::string& dbname) {
+ return dbname + "/LOCK";
+}
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+ return MakeFileName(dbname, number, kTempFileNameSuffix.c_str());
+}
+
+InfoLogPrefix::InfoLogPrefix(bool has_log_dir,
+ const std::string& db_absolute_path) {
+ if (!has_log_dir) {
+ const char kInfoLogPrefix[] = "LOG";
+ // "\0" is automatically added to the end
+ snprintf(buf, sizeof(buf), kInfoLogPrefix);
+ prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1);
+ } else {
+ size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf));
+ prefix = Slice(buf, len);
+ }
+}
+
+std::string InfoLogFileName(const std::string& dbname,
+ const std::string& db_path, const std::string& log_dir) {
+ if (log_dir.empty()) {
+ return dbname + "/LOG";
+ }
+
+ InfoLogPrefix info_log_prefix(true, db_path);
+ return log_dir + "/" + info_log_prefix.buf;
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+ const std::string& db_path, const std::string& log_dir) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
+
+ if (log_dir.empty()) {
+ return dbname + "/LOG.old." + buf;
+ }
+
+ InfoLogPrefix info_log_prefix(true, db_path);
+ return log_dir + "/" + info_log_prefix.buf + ".old." + buf;
+}
+
+std::string OptionsFileName(const std::string& dbname, uint64_t file_num) {
+ char buffer[256];
+ snprintf(buffer, sizeof(buffer), "%s%06" PRIu64,
+ kOptionsFileNamePrefix.c_str(), file_num);
+ return dbname + "/" + buffer;
+}
+
+std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) {
+ char buffer[256];
+ snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s",
+ kOptionsFileNamePrefix.c_str(), file_num,
+ kTempFileNameSuffix.c_str());
+ return dbname + "/" + buffer;
+}
+
+std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "/METADB-%llu",
+ static_cast<unsigned long long>(number));
+ return dbname + buf;
+}
+
+std::string IdentityFileName(const std::string& dbname) {
+ return dbname + "/IDENTITY";
+}
+
+// Owned filenames have the form:
+// dbname/IDENTITY
+// dbname/CURRENT
+// dbname/LOCK
+// dbname/<info_log_name_prefix>
+// dbname/<info_log_name_prefix>.old.[0-9]+
+// dbname/MANIFEST-[0-9]+
+// dbname/[0-9]+.(log|sst|blob)
+// dbname/METADB-[0-9]+
+// dbname/OPTIONS-[0-9]+
+// dbname/OPTIONS-[0-9]+.dbtmp
+// Disregards / at the beginning
+bool ParseFileName(const std::string& fname,
+ uint64_t* number,
+ FileType* type,
+ WalFileType* log_type) {
+ return ParseFileName(fname, number, "", type, log_type);
+}
+
+bool ParseFileName(const std::string& fname, uint64_t* number,
+ const Slice& info_log_name_prefix, FileType* type,
+ WalFileType* log_type) {
+ Slice rest(fname);
+ if (fname.length() > 1 && fname[0] == '/') {
+ rest.remove_prefix(1);
+ }
+ if (rest == "IDENTITY") {
+ *number = 0;
+ *type = kIdentityFile;
+ } else if (rest == "CURRENT") {
+ *number = 0;
+ *type = kCurrentFile;
+ } else if (rest == "LOCK") {
+ *number = 0;
+ *type = kDBLockFile;
+ } else if (info_log_name_prefix.size() > 0 &&
+ rest.starts_with(info_log_name_prefix)) {
+ rest.remove_prefix(info_log_name_prefix.size());
+ if (rest == "" || rest == ".old") {
+ *number = 0;
+ *type = kInfoLogFile;
+ } else if (rest.starts_with(".old.")) {
+ uint64_t ts_suffix;
+ // sizeof also counts the trailing '\0'.
+ rest.remove_prefix(sizeof(".old.") - 1);
+ if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+ return false;
+ }
+ *number = ts_suffix;
+ *type = kInfoLogFile;
+ }
+ } else if (rest.starts_with("MANIFEST-")) {
+ rest.remove_prefix(strlen("MANIFEST-"));
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ if (!rest.empty()) {
+ return false;
+ }
+ *type = kDescriptorFile;
+ *number = num;
+ } else if (rest.starts_with("METADB-")) {
+ rest.remove_prefix(strlen("METADB-"));
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ if (!rest.empty()) {
+ return false;
+ }
+ *type = kMetaDatabase;
+ *number = num;
+ } else if (rest.starts_with(kOptionsFileNamePrefix)) {
+ uint64_t ts_suffix;
+ bool is_temp_file = false;
+ rest.remove_prefix(kOptionsFileNamePrefix.size());
+ const std::string kTempFileNameSuffixWithDot =
+ std::string(".") + kTempFileNameSuffix;
+ if (rest.ends_with(kTempFileNameSuffixWithDot)) {
+ rest.remove_suffix(kTempFileNameSuffixWithDot.size());
+ is_temp_file = true;
+ }
+ if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+ return false;
+ }
+ *number = ts_suffix;
+ *type = is_temp_file ? kTempFile : kOptionsFile;
+ } else {
+ // Avoid strtoull() to keep filename format independent of the
+ // current locale
+ bool archive_dir_found = false;
+ if (rest.starts_with(ARCHIVAL_DIR)) {
+ if (rest.size() <= ARCHIVAL_DIR.size()) {
+ return false;
+ }
+ rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
+ if (log_type) {
+ *log_type = kArchivedLogFile;
+ }
+ archive_dir_found = true;
+ }
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ if (rest.size() <= 1 || rest[0] != '.') {
+ return false;
+ }
+ rest.remove_prefix(1);
+
+ Slice suffix = rest;
+ if (suffix == Slice("log")) {
+ *type = kLogFile;
+ if (log_type && !archive_dir_found) {
+ *log_type = kAliveLogFile;
+ }
+ } else if (archive_dir_found) {
+ return false; // Archive dir can contain only log files
+ } else if (suffix == Slice(kRocksDbTFileExt) ||
+ suffix == Slice(kLevelDbTFileExt)) {
+ *type = kTableFile;
+ } else if (suffix == Slice(kRocksDBBlobFileExt)) {
+ *type = kBlobFile;
+ } else if (suffix == Slice(kTempFileNameSuffix)) {
+ *type = kTempFile;
+ } else {
+ return false;
+ }
+ *number = num;
+ }
+ return true;
+}
+
+Status SetCurrentFile(Env* env, const std::string& dbname,
+ uint64_t descriptor_number,
+ Directory* directory_to_fsync) {
+ // Remove leading "dbname/" and add newline to manifest file name
+ std::string manifest = DescriptorFileName(dbname, descriptor_number);
+ Slice contents = manifest;
+ assert(contents.starts_with(dbname + "/"));
+ contents.remove_prefix(dbname.size() + 1);
+ std::string tmp = TempFileName(dbname, descriptor_number);
+ Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true);
+ if (s.ok()) {
+ TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2);
+ s = env->RenameFile(tmp, CurrentFileName(dbname));
+ TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2);
+ }
+ if (s.ok()) {
+ if (directory_to_fsync != nullptr) {
+ s = directory_to_fsync->Fsync();
+ }
+ } else {
+ env->DeleteFile(tmp);
+ }
+ return s;
+}
+
+Status SetIdentityFile(Env* env, const std::string& dbname,
+ const std::string& db_id) {
+ std::string id;
+ if (db_id.empty()) {
+ id = env->GenerateUniqueId();
+ } else {
+ id = db_id;
+ }
+ assert(!id.empty());
+ // Reserve the filename dbname/000000.dbtmp for the temporary identity file
+ std::string tmp = TempFileName(dbname, 0);
+ Status s = WriteStringToFile(env, id, tmp, true);
+ if (s.ok()) {
+ s = env->RenameFile(tmp, IdentityFileName(dbname));
+ }
+ if (!s.ok()) {
+ env->DeleteFile(tmp);
+ }
+ return s;
+}
+
+Status SyncManifest(Env* env, const ImmutableDBOptions* db_options,
+ WritableFileWriter* file) {
+ TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2);
+ StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
+ return file->Sync(db_options->use_fsync);
+}
+
+Status GetInfoLogFiles(Env* env, const std::string& db_log_dir,
+ const std::string& dbname, std::string* parent_dir,
+ std::vector<std::string>* info_log_list) {
+ assert(parent_dir != nullptr);
+ assert(info_log_list != nullptr);
+ uint64_t number = 0;
+ FileType type = kLogFile;
+
+ if (!db_log_dir.empty()) {
+ *parent_dir = db_log_dir;
+ } else {
+ *parent_dir = dbname;
+ }
+
+ InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname);
+
+ std::vector<std::string> file_names;
+ Status s = env->GetChildren(*parent_dir, &file_names);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (auto& f : file_names) {
+ if (ParseFileName(f, &number, info_log_prefix.prefix, &type) &&
+ (type == kInfoLogFile)) {
+ info_log_list->push_back(f);
+ }
+ }
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/filename.h b/src/rocksdb/file/filename.h
new file mode 100644
index 000000000..909287c25
--- /dev/null
+++ b/src/rocksdb/file/filename.h
@@ -0,0 +1,185 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#pragma once
+#include <stdint.h>
+#include <unordered_map>
+#include <string>
+#include <vector>
+
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Directory;
+class WritableFileWriter;
+
+enum FileType {
+ kLogFile,
+ kDBLockFile,
+ kTableFile,
+ kDescriptorFile,
+ kCurrentFile,
+ kTempFile,
+ kInfoLogFile, // Either the current one, or an old one
+ kMetaDatabase,
+ kIdentityFile,
+ kOptionsFile,
+ kBlobFile
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+extern std::string LogFileName(uint64_t number);
+
+extern std::string BlobFileName(const std::string& bdirname, uint64_t number);
+
+extern std::string BlobFileName(const std::string& dbname,
+ const std::string& blob_dir, uint64_t number);
+
+static const std::string ARCHIVAL_DIR = "archive";
+
+extern std::string ArchivalDirectory(const std::string& dbname);
+
+// Return the name of the archived log file with the specified number
+// in the db named by "dbname". The result will be prefixed with "dbname".
+extern std::string ArchivedLogFileName(const std::string& dbname,
+ uint64_t num);
+
+extern std::string MakeTableFileName(const std::string& name, uint64_t number);
+
+extern std::string MakeTableFileName(uint64_t number);
+
+// Return the name of sstable with LevelDB suffix
+// created from RocksDB sstable suffixed name
+extern std::string Rocks2LevelTableFileName(const std::string& fullname);
+
+// the reverse function of MakeTableFileName
+// TODO(yhchiang): could merge this function with ParseFileName()
+extern uint64_t TableFileNameToNumber(const std::string& name);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::vector<DbPath>& db_paths,
+ uint64_t number, uint32_t path_id);
+
+// Sufficient buffer size for FormatFileNumber.
+const size_t kFormatFileNumberBufSize = 38;
+
+extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+ size_t out_buf_size);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number. The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+ uint64_t number);
+
+// Return the name of the current file. This file contains the name
+// of the current manifest file. The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname". The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// A helper structure for prefix of info log names.
+struct InfoLogPrefix {
+ char buf[260];
+ Slice prefix;
+ // Prefix with DB absolute path encoded
+ explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
+ // Default Prefix
+ explicit InfoLogPrefix();
+};
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+ const std::string& db_path = "",
+ const std::string& log_dir = "");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+ const std::string& db_path = "",
+ const std::string& log_dir = "");
+
+static const std::string kOptionsFileNamePrefix = "OPTIONS-";
+static const std::string kTempFileNameSuffix = "dbtmp";
+
+// Return a options file name given the "dbname" and file number.
+// Format: OPTIONS-[number].dbtmp
+extern std::string OptionsFileName(const std::string& dbname,
+ uint64_t file_num);
+
+// Return a temp options file name given the "dbname" and file number.
+// Format: OPTIONS-[number]
+extern std::string TempOptionsFileName(const std::string& dbname,
+ uint64_t file_num);
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname,
+ uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh
+// either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number. If the
+// filename was successfully parsed, returns true. Else return false.
+// info_log_name_prefix is the path of info logs.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+ const Slice& info_log_name_prefix, FileType* type,
+ WalFileType* log_type = nullptr);
+// Same as previous function, but skip info log files.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+ FileType* type, WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+ uint64_t descriptor_number,
+ Directory* directory_to_fsync);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname,
+ const std::string& db_id = {});
+
+// Sync manifest file `file`.
+extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options,
+ WritableFileWriter* file);
+
+// Return list of file names of info logs in `file_names`.
+// The list only contains file name. The parent directory name is stored
+// in `parent_dir`.
+// `db_log_dir` should be the one as in options.db_log_dir
+extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir,
+ const std::string& dbname,
+ std::string* parent_dir,
+ std::vector<std::string>* file_names);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/random_access_file_reader.cc b/src/rocksdb/file/random_access_file_reader.cc
new file mode 100644
index 000000000..46892360f
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader.cc
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/random_access_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch, bool for_compaction) const {
+ Status s;
+ uint64_t elapsed = 0;
+ {
+ StopWatch sw(env_, stats_, hist_type_,
+ (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
+ true /*delay_enabled*/);
+ auto prev_perf_level = GetPerfLevel();
+ IOSTATS_TIMER_GUARD(read_nanos);
+ if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+ size_t alignment = file_->GetRequiredBufferAlignment();
+ size_t aligned_offset =
+ TruncateToPageBoundary(alignment, static_cast<size_t>(offset));
+ size_t offset_advance = static_cast<size_t>(offset) - aligned_offset;
+ size_t read_size =
+ Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset;
+ AlignedBuffer buf;
+ buf.Alignment(alignment);
+ buf.AllocateNewBuffer(read_size);
+ while (buf.CurrentSize() < read_size) {
+ size_t allowed;
+ if (for_compaction && rate_limiter_ != nullptr) {
+ allowed = rate_limiter_->RequestToken(
+ buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
+ Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead);
+ } else {
+ assert(buf.CurrentSize() == 0);
+ allowed = read_size;
+ }
+ Slice tmp;
+
+ FileOperationInfo::TimePoint start_ts;
+ uint64_t orig_offset = 0;
+ if (ShouldNotifyListeners()) {
+ start_ts = std::chrono::system_clock::now();
+ orig_offset = aligned_offset + buf.CurrentSize();
+ }
+ {
+ IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
+ s = file_->Read(aligned_offset + buf.CurrentSize(), allowed,
+ IOOptions(), &tmp, buf.Destination(), nullptr);
+ }
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::system_clock::now();
+ NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
+ s);
+ }
+
+ buf.Size(buf.CurrentSize() + tmp.size());
+ if (!s.ok() || tmp.size() < allowed) {
+ break;
+ }
+ }
+ size_t res_len = 0;
+ if (s.ok() && offset_advance < buf.CurrentSize()) {
+ res_len = buf.Read(scratch, offset_advance,
+ std::min(buf.CurrentSize() - offset_advance, n));
+ }
+ *result = Slice(scratch, res_len);
+#endif // !ROCKSDB_LITE
+ } else {
+ size_t pos = 0;
+ const char* res_scratch = nullptr;
+ while (pos < n) {
+ size_t allowed;
+ if (for_compaction && rate_limiter_ != nullptr) {
+ if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+ sw.DelayStart();
+ }
+ allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */,
+ Env::IOPriority::IO_LOW, stats_,
+ RateLimiter::OpType::kRead);
+ if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+ sw.DelayStop();
+ }
+ } else {
+ allowed = n;
+ }
+ Slice tmp_result;
+
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::TimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = std::chrono::system_clock::now();
+ }
+#endif
+ {
+ IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
+ s = file_->Read(offset + pos, allowed, IOOptions(), &tmp_result,
+ scratch + pos, nullptr);
+ }
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::system_clock::now();
+ NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
+ finish_ts, s);
+ }
+#endif
+
+ if (res_scratch == nullptr) {
+ // we can't simply use `scratch` because reads of mmap'd files return
+ // data in a different buffer.
+ res_scratch = tmp_result.data();
+ } else {
+ // make sure chunks are inserted contiguously into `res_scratch`.
+ assert(tmp_result.data() == res_scratch + pos);
+ }
+ pos += tmp_result.size();
+ if (!s.ok() || tmp_result.size() < allowed) {
+ break;
+ }
+ }
+ *result = Slice(res_scratch, s.ok() ? pos : 0);
+ }
+ IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
+ SetPerfLevel(prev_perf_level);
+ }
+ if (stats_ != nullptr && file_read_hist_ != nullptr) {
+ file_read_hist_->Add(elapsed);
+ }
+
+ return s;
+}
+
+Status RandomAccessFileReader::MultiRead(FSReadRequest* read_reqs,
+ size_t num_reqs) const {
+ Status s;
+ uint64_t elapsed = 0;
+ assert(!use_direct_io());
+ {
+ StopWatch sw(env_, stats_, hist_type_,
+ (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
+ true /*delay_enabled*/);
+ auto prev_perf_level = GetPerfLevel();
+ IOSTATS_TIMER_GUARD(read_nanos);
+
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::TimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = std::chrono::system_clock::now();
+ }
+#endif // ROCKSDB_LITE
+ {
+ IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
+ s = file_->MultiRead(read_reqs, num_reqs, IOOptions(), nullptr);
+ }
+ for (size_t i = 0; i < num_reqs; ++i) {
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::system_clock::now();
+ NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(),
+ start_ts, finish_ts, read_reqs[i].status);
+ }
+#endif // ROCKSDB_LITE
+ IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size());
+ }
+ SetPerfLevel(prev_perf_level);
+ }
+ if (stats_ != nullptr && file_read_hist_ != nullptr) {
+ file_read_hist_->Add(elapsed);
+ }
+
+ return s;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/random_access_file_reader.h b/src/rocksdb/file/random_access_file_reader.h
new file mode 100644
index 000000000..35027bf45
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <sstream>
+#include <string>
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class HistogramImpl;
+
+// RandomAccessFileReader is a wrapper on top of Env::RnadomAccessFile. It is
+// responsible for:
+// - Handling Buffered and Direct reads appropriately.
+// - Rate limiting compaction reads.
+// - Notifying any interested listeners on the completion of a read.
+// - Updating IO stats.
+class RandomAccessFileReader {
+ private:
+#ifndef ROCKSDB_LITE
+ void NotifyOnFileReadFinish(uint64_t offset, size_t length,
+ const FileOperationInfo::TimePoint& start_ts,
+ const FileOperationInfo::TimePoint& finish_ts,
+ const Status& status) const {
+ FileOperationInfo info(file_name_, start_ts, finish_ts);
+ info.offset = offset;
+ info.length = length;
+ info.status = status;
+
+ for (auto& listener : listeners_) {
+ listener->OnFileReadFinish(info);
+ }
+ }
+#endif // ROCKSDB_LITE
+
+ bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+
+ std::unique_ptr<FSRandomAccessFile> file_;
+ std::string file_name_;
+ Env* env_;
+ Statistics* stats_;
+ uint32_t hist_type_;
+ HistogramImpl* file_read_hist_;
+ RateLimiter* rate_limiter_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+
+ public:
+ explicit RandomAccessFileReader(
+ std::unique_ptr<FSRandomAccessFile>&& raf, std::string _file_name,
+ Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0,
+ HistogramImpl* file_read_hist = nullptr,
+ RateLimiter* rate_limiter = nullptr,
+ const std::vector<std::shared_ptr<EventListener>>& listeners = {})
+ : file_(std::move(raf)),
+ file_name_(std::move(_file_name)),
+ env_(env),
+ stats_(stats),
+ hist_type_(hist_type),
+ file_read_hist_(file_read_hist),
+ rate_limiter_(rate_limiter),
+ listeners_() {
+#ifndef ROCKSDB_LITE
+ std::for_each(listeners.begin(), listeners.end(),
+ [this](const std::shared_ptr<EventListener>& e) {
+ if (e->ShouldBeNotifiedOnFileIO()) {
+ listeners_.emplace_back(e);
+ }
+ });
+#else // !ROCKSDB_LITE
+ (void)listeners;
+#endif
+ }
+
+ RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT {
+ *this = std::move(o);
+ }
+
+ RandomAccessFileReader& operator=(RandomAccessFileReader&& o)
+ ROCKSDB_NOEXCEPT {
+ file_ = std::move(o.file_);
+ env_ = std::move(o.env_);
+ stats_ = std::move(o.stats_);
+ hist_type_ = std::move(o.hist_type_);
+ file_read_hist_ = std::move(o.file_read_hist_);
+ rate_limiter_ = std::move(o.rate_limiter_);
+ return *this;
+ }
+
+ RandomAccessFileReader(const RandomAccessFileReader&) = delete;
+ RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;
+
+ Status Read(uint64_t offset, size_t n, Slice* result, char* scratch,
+ bool for_compaction = false) const;
+
+ Status MultiRead(FSReadRequest* reqs, size_t num_reqs) const;
+
+ Status Prefetch(uint64_t offset, size_t n) const {
+ return file_->Prefetch(offset, n, IOOptions(), nullptr);
+ }
+
+ FSRandomAccessFile* file() { return file_.get(); }
+
+ std::string file_name() const { return file_name_; }
+
+ bool use_direct_io() const { return file_->use_direct_io(); }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/read_write_util.cc b/src/rocksdb/file/read_write_util.cc
new file mode 100644
index 000000000..b4854e110
--- /dev/null
+++ b/src/rocksdb/file/read_write_util.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/read_write_util.h"
+
+#include <sstream>
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
+ std::unique_ptr<FSWritableFile>* result,
+ const FileOptions& options) {
+ IOStatus s = fs->NewWritableFile(fname, options, result, nullptr);
+ TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2);
+ return s;
+}
+
+bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader,
+ std::string* output, bool* has_data, Status* result) {
+ const int kBufferSize = 8192;
+ char buffer[kBufferSize + 1];
+ Slice input_slice;
+
+ std::string line;
+ bool has_complete_line = false;
+ while (!has_complete_line) {
+ if (std::getline(*iss, line)) {
+ has_complete_line = !iss->eof();
+ } else {
+ has_complete_line = false;
+ }
+ if (!has_complete_line) {
+ // if we're not sure whether we have a complete line,
+ // further read from the file.
+ if (*has_data) {
+ *result = seq_file_reader->Read(kBufferSize, &input_slice, buffer);
+ }
+ if (input_slice.size() == 0) {
+ // meaning we have read all the data
+ *has_data = false;
+ break;
+ } else {
+ iss->str(line + input_slice.ToString());
+ // reset the internal state of iss so that we can keep reading it.
+ iss->clear();
+ *has_data = (input_slice.size() == kBufferSize);
+ continue;
+ }
+ }
+ }
+ *output = line;
+ return *has_data || has_complete_line;
+}
+
+#ifndef NDEBUG
+bool IsFileSectorAligned(const size_t off, size_t sector_size) {
+ return off % sector_size == 0;
+}
+#endif // NDEBUG
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/read_write_util.h b/src/rocksdb/file/read_write_util.h
new file mode 100644
index 000000000..22f4076b3
--- /dev/null
+++ b/src/rocksdb/file/read_write_util.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include "file/sequence_file_reader.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a WritableFile.
+//
+// env : the Env.
+// fname : the file name.
+// result : output arg. A WritableFile based on `fname` returned.
+// options : the Env Options.
+extern IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
+ std::unique_ptr<FSWritableFile>* result,
+ const FileOptions& options);
+
+// Read a single line from a file.
+bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader,
+ std::string* output, bool* has_data, Status* result);
+
+#ifndef NDEBUG
+bool IsFileSectorAligned(const size_t off, size_t sector_size);
+#endif // NDEBUG
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_raf.cc b/src/rocksdb/file/readahead_raf.cc
new file mode 100644
index 000000000..493f9d9e8
--- /dev/null
+++ b/src/rocksdb/file/readahead_raf.cc
@@ -0,0 +1,162 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/readahead_raf.h"
+
+#include <algorithm>
+#include <mutex>
+#include "file/read_write_util.h"
+#include "util/aligned_buffer.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class ReadaheadRandomAccessFile : public RandomAccessFile {
+ public:
+ ReadaheadRandomAccessFile(std::unique_ptr<RandomAccessFile>&& file,
+ size_t readahead_size)
+ : file_(std::move(file)),
+ alignment_(file_->GetRequiredBufferAlignment()),
+ readahead_size_(Roundup(readahead_size, alignment_)),
+ buffer_(),
+ buffer_offset_(0) {
+ buffer_.Alignment(alignment_);
+ buffer_.AllocateNewBuffer(readahead_size_);
+ }
+
+ ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
+
+ ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) =
+ delete;
+
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ // Read-ahead only make sense if we have some slack left after reading
+ if (n + alignment_ >= readahead_size_) {
+ return file_->Read(offset, n, result, scratch);
+ }
+
+ std::unique_lock<std::mutex> lk(lock_);
+
+ size_t cached_len = 0;
+ // Check if there is a cache hit, meaning that [offset, offset + n) is
+ // either completely or partially in the buffer. If it's completely cached,
+ // including end of file case when offset + n is greater than EOF, then
+ // return.
+ if (TryReadFromCache(offset, n, &cached_len, scratch) &&
+ (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+ // We read exactly what we needed, or we hit end of file - return.
+ *result = Slice(scratch, cached_len);
+ return Status::OK();
+ }
+ size_t advanced_offset = static_cast<size_t>(offset + cached_len);
+ // In the case of cache hit advanced_offset is already aligned, means that
+ // chunk_offset equals to advanced_offset
+ size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
+
+ Status s = ReadIntoBuffer(chunk_offset, readahead_size_);
+ if (s.ok()) {
+ // The data we need is now in cache, so we can safely read it
+ size_t remaining_len;
+ TryReadFromCache(advanced_offset, n - cached_len, &remaining_len,
+ scratch + cached_len);
+ *result = Slice(scratch, cached_len + remaining_len);
+ }
+ return s;
+ }
+
+ Status Prefetch(uint64_t offset, size_t n) override {
+ if (n < readahead_size_) {
+ // Don't allow smaller prefetches than the configured `readahead_size_`.
+ // `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
+ return Status::OK();
+ }
+
+ std::unique_lock<std::mutex> lk(lock_);
+
+ size_t offset_ = static_cast<size_t>(offset);
+ size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_);
+ if (prefetch_offset == buffer_offset_) {
+ return Status::OK();
+ }
+ return ReadIntoBuffer(prefetch_offset,
+ Roundup(offset_ + n, alignment_) - prefetch_offset);
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return file_->GetUniqueId(id, max_size);
+ }
+
+ void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
+
+ Status InvalidateCache(size_t offset, size_t length) override {
+ std::unique_lock<std::mutex> lk(lock_);
+ buffer_.Clear();
+ return file_->InvalidateCache(offset, length);
+ }
+
+ bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+ // Tries to read from buffer_ n bytes starting at offset. If anything was read
+ // from the cache, it sets cached_len to the number of bytes actually read,
+ // copies these number of bytes to scratch and returns true.
+ // If nothing was read sets cached_len to 0 and returns false.
+ bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
+ char* scratch) const {
+ if (offset < buffer_offset_ ||
+ offset >= buffer_offset_ + buffer_.CurrentSize()) {
+ *cached_len = 0;
+ return false;
+ }
+ uint64_t offset_in_buffer = offset - buffer_offset_;
+ *cached_len = std::min(
+ buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+ memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+ return true;
+ }
+
+ // Reads into buffer_ the next n bytes from file_ starting at offset.
+ // Can actually read less if EOF was reached.
+ // Returns the status of the read operastion on the file.
+ Status ReadIntoBuffer(uint64_t offset, size_t n) const {
+ if (n > buffer_.Capacity()) {
+ n = buffer_.Capacity();
+ }
+ assert(IsFileSectorAligned(offset, alignment_));
+ assert(IsFileSectorAligned(n, alignment_));
+ Slice result;
+ Status s = file_->Read(offset, n, &result, buffer_.BufferStart());
+ if (s.ok()) {
+ buffer_offset_ = offset;
+ buffer_.Size(result.size());
+ assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+ }
+ return s;
+ }
+
+ const std::unique_ptr<RandomAccessFile> file_;
+ const size_t alignment_;
+ const size_t readahead_size_;
+
+ mutable std::mutex lock_;
+ // The buffer storing the prefetched data
+ mutable AlignedBuffer buffer_;
+ // The offset in file_, corresponding to data stored in buffer_
+ mutable uint64_t buffer_offset_;
+};
+} // namespace
+
+std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
+ std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size) {
+ std::unique_ptr<RandomAccessFile> result(
+ new ReadaheadRandomAccessFile(std::move(file), readahead_size));
+ return result;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_raf.h b/src/rocksdb/file/readahead_raf.h
new file mode 100644
index 000000000..cbdcb124f
--- /dev/null
+++ b/src/rocksdb/file/readahead_raf.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+// This file provides the following main abstractions:
+// SequentialFileReader : wrapper over Env::SequentialFile
+// RandomAccessFileReader : wrapper over Env::RandomAccessFile
+// WritableFileWriter : wrapper over Env::WritableFile
+// In addition, it also exposed NewReadaheadRandomAccessFile, NewWritableFile,
+// and ReadOneLine primitives.
+
+// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to
+// always prefetch additional data with every read. This is mainly used in
+// Compaction Table Readers.
+std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
+ std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sequence_file_reader.cc b/src/rocksdb/file/sequence_file_reader.cc
new file mode 100644
index 000000000..81c5e5d1d
--- /dev/null
+++ b/src/rocksdb/file/sequence_file_reader.cc
@@ -0,0 +1,237 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/sequence_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/read_write_util.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
+ Status s;
+ if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+ size_t offset = offset_.fetch_add(n);
+ size_t alignment = file_->GetRequiredBufferAlignment();
+ size_t aligned_offset = TruncateToPageBoundary(alignment, offset);
+ size_t offset_advance = offset - aligned_offset;
+ size_t size = Roundup(offset + n, alignment) - aligned_offset;
+ size_t r = 0;
+ AlignedBuffer buf;
+ buf.Alignment(alignment);
+ buf.AllocateNewBuffer(size);
+ Slice tmp;
+ s = file_->PositionedRead(aligned_offset, size, IOOptions(), &tmp,
+ buf.BufferStart(), nullptr);
+ if (s.ok() && offset_advance < tmp.size()) {
+ buf.Size(tmp.size());
+ r = buf.Read(scratch, offset_advance,
+ std::min(tmp.size() - offset_advance, n));
+ }
+ *result = Slice(scratch, r);
+#endif // !ROCKSDB_LITE
+ } else {
+ s = file_->Read(n, IOOptions(), result, scratch, nullptr);
+ }
+ IOSTATS_ADD(bytes_read, result->size());
+ return s;
+}
+
+Status SequentialFileReader::Skip(uint64_t n) {
+#ifndef ROCKSDB_LITE
+ if (use_direct_io()) {
+ offset_ += static_cast<size_t>(n);
+ return Status::OK();
+ }
+#endif // !ROCKSDB_LITE
+ return file_->Skip(n);
+}
+
+namespace {
+// This class wraps a SequentialFile, exposing same API, with the differenece
+// of being able to prefetch up to readahead_size bytes and then serve them
+// from memory, avoiding the entire round-trip if, for example, the data for the
+// file is actually remote.
+class ReadaheadSequentialFile : public FSSequentialFile {
+ public:
+ ReadaheadSequentialFile(std::unique_ptr<FSSequentialFile>&& file,
+ size_t readahead_size)
+ : file_(std::move(file)),
+ alignment_(file_->GetRequiredBufferAlignment()),
+ readahead_size_(Roundup(readahead_size, alignment_)),
+ buffer_(),
+ buffer_offset_(0),
+ read_offset_(0) {
+ buffer_.Alignment(alignment_);
+ buffer_.AllocateNewBuffer(readahead_size_);
+ }
+
+ ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete;
+
+ ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete;
+
+ IOStatus Read(size_t n, const IOOptions& opts, Slice* result, char* scratch,
+ IODebugContext* dbg) override {
+ std::unique_lock<std::mutex> lk(lock_);
+
+ size_t cached_len = 0;
+ // Check if there is a cache hit, meaning that [offset, offset + n) is
+ // either completely or partially in the buffer. If it's completely cached,
+ // including end of file case when offset + n is greater than EOF, then
+ // return.
+ if (TryReadFromCache(n, &cached_len, scratch) &&
+ (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+ // We read exactly what we needed, or we hit end of file - return.
+ *result = Slice(scratch, cached_len);
+ return IOStatus::OK();
+ }
+ n -= cached_len;
+
+ IOStatus s;
+ // Read-ahead only make sense if we have some slack left after reading
+ if (n + alignment_ >= readahead_size_) {
+ s = file_->Read(n, opts, result, scratch + cached_len, dbg);
+ if (s.ok()) {
+ read_offset_ += result->size();
+ *result = Slice(scratch, cached_len + result->size());
+ }
+ buffer_.Clear();
+ return s;
+ }
+
+ s = ReadIntoBuffer(readahead_size_, opts, dbg);
+ if (s.ok()) {
+ // The data we need is now in cache, so we can safely read it
+ size_t remaining_len;
+ TryReadFromCache(n, &remaining_len, scratch + cached_len);
+ *result = Slice(scratch, cached_len + remaining_len);
+ }
+ return s;
+ }
+
+ IOStatus Skip(uint64_t n) override {
+ std::unique_lock<std::mutex> lk(lock_);
+ IOStatus s = IOStatus::OK();
+ // First check if we need to skip already cached data
+ if (buffer_.CurrentSize() > 0) {
+ // Do we need to skip beyond cached data?
+ if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) {
+ // Yes. Skip whaterver is in memory and adjust offset accordingly
+ n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_;
+ read_offset_ = buffer_offset_ + buffer_.CurrentSize();
+ } else {
+ // No. The entire section to be skipped is entirely i cache.
+ read_offset_ += n;
+ n = 0;
+ }
+ }
+ if (n > 0) {
+ // We still need to skip more, so call the file API for skipping
+ s = file_->Skip(n);
+ if (s.ok()) {
+ read_offset_ += n;
+ }
+ buffer_.Clear();
+ }
+ return s;
+ }
+
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& opts,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override {
+ return file_->PositionedRead(offset, n, opts, result, scratch, dbg);
+ }
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ std::unique_lock<std::mutex> lk(lock_);
+ buffer_.Clear();
+ return file_->InvalidateCache(offset, length);
+ }
+
+ bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+ // Tries to read from buffer_ n bytes. If anything was read from the cache, it
+ // sets cached_len to the number of bytes actually read, copies these number
+ // of bytes to scratch and returns true.
+ // If nothing was read sets cached_len to 0 and returns false.
+ bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) {
+ if (read_offset_ < buffer_offset_ ||
+ read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) {
+ *cached_len = 0;
+ return false;
+ }
+ uint64_t offset_in_buffer = read_offset_ - buffer_offset_;
+ *cached_len = std::min(
+ buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+ memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+ read_offset_ += *cached_len;
+ return true;
+ }
+
+ // Reads into buffer_ the next n bytes from file_.
+ // Can actually read less if EOF was reached.
+ // Returns the status of the read operastion on the file.
+ IOStatus ReadIntoBuffer(size_t n, const IOOptions& opts,
+ IODebugContext* dbg) {
+ if (n > buffer_.Capacity()) {
+ n = buffer_.Capacity();
+ }
+ assert(IsFileSectorAligned(n, alignment_));
+ Slice result;
+ IOStatus s = file_->Read(n, opts, &result, buffer_.BufferStart(), dbg);
+ if (s.ok()) {
+ buffer_offset_ = read_offset_;
+ buffer_.Size(result.size());
+ assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+ }
+ return s;
+ }
+
+ const std::unique_ptr<FSSequentialFile> file_;
+ const size_t alignment_;
+ const size_t readahead_size_;
+
+ std::mutex lock_;
+ // The buffer storing the prefetched data
+ AlignedBuffer buffer_;
+ // The offset in file_, corresponding to data stored in buffer_
+ uint64_t buffer_offset_;
+ // The offset up to which data was read from file_. In fact, it can be larger
+ // than the actual file size, since the file_->Skip(n) call doesn't return the
+ // actual number of bytes that were skipped, which can be less than n.
+ // This is not a problemm since read_offset_ is monotonically increasing and
+ // its only use is to figure out if next piece of data should be read from
+ // buffer_ or file_ directly.
+ uint64_t read_offset_;
+};
+} // namespace
+
+std::unique_ptr<FSSequentialFile>
+SequentialFileReader::NewReadaheadSequentialFile(
+ std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size) {
+ if (file->GetRequiredBufferAlignment() >= readahead_size) {
+ // Short-circuit and return the original file if readahead_size is
+ // too small and hence doesn't make sense to be used for prefetching.
+ return std::move(file);
+ }
+ std::unique_ptr<FSSequentialFile> result(
+ new ReadaheadSequentialFile(std::move(file), readahead_size));
+ return result;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sequence_file_reader.h b/src/rocksdb/file/sequence_file_reader.h
new file mode 100644
index 000000000..2f0898b42
--- /dev/null
+++ b/src/rocksdb/file/sequence_file_reader.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles
+// Buffered (i.e when page cache is enabled) and Direct (with O_DIRECT / page
+// cache disabled) reads appropriately, and also updates the IO stats.
+class SequentialFileReader {
+ private:
+ std::unique_ptr<FSSequentialFile> file_;
+ std::string file_name_;
+ std::atomic<size_t> offset_{0}; // read offset
+
+ public:
+ explicit SequentialFileReader(std::unique_ptr<FSSequentialFile>&& _file,
+ const std::string& _file_name)
+ : file_(std::move(_file)), file_name_(_file_name) {}
+
+ explicit SequentialFileReader(std::unique_ptr<FSSequentialFile>&& _file,
+ const std::string& _file_name,
+ size_t _readahead_size)
+ : file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size)),
+ file_name_(_file_name) {}
+
+ SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT {
+ *this = std::move(o);
+ }
+
+ SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT {
+ file_ = std::move(o.file_);
+ return *this;
+ }
+
+ SequentialFileReader(const SequentialFileReader&) = delete;
+ SequentialFileReader& operator=(const SequentialFileReader&) = delete;
+
+ Status Read(size_t n, Slice* result, char* scratch);
+
+ Status Skip(uint64_t n);
+
+ FSSequentialFile* file() { return file_.get(); }
+
+ std::string file_name() { return file_name_; }
+
+ bool use_direct_io() const { return file_->use_direct_io(); }
+
+ private:
+ // NewReadaheadSequentialFile provides a wrapper over SequentialFile to
+ // always prefetch additional data with every read.
+ static std::unique_ptr<FSSequentialFile> NewReadaheadSequentialFile(
+ std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size);
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sst_file_manager_impl.cc b/src/rocksdb/file/sst_file_manager_impl.cc
new file mode 100644
index 000000000..35056429e
--- /dev/null
+++ b/src/rocksdb/file/sst_file_manager_impl.cc
@@ -0,0 +1,558 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "file/sst_file_manager_impl.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "env/composite_env_wrapper.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/sst_file_manager.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr<FileSystem> fs,
+ std::shared_ptr<Logger> logger,
+ int64_t rate_bytes_per_sec,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk)
+ : env_(env),
+ fs_(fs),
+ logger_(logger),
+ total_files_size_(0),
+ in_progress_files_size_(0),
+ compaction_buffer_size_(0),
+ cur_compactions_reserved_size_(0),
+ max_allowed_space_(0),
+ delete_scheduler_(env, fs_.get(), rate_bytes_per_sec, logger.get(), this,
+ max_trash_db_ratio, bytes_max_delete_chunk),
+ cv_(&mu_),
+ closing_(false),
+ bg_thread_(nullptr),
+ reserved_disk_buffer_(0),
+ free_space_trigger_(0),
+ cur_instance_(nullptr) {}
+
+SstFileManagerImpl::~SstFileManagerImpl() {
+ Close();
+}
+
+void SstFileManagerImpl::Close() {
+ {
+ MutexLock l(&mu_);
+ if (closing_) {
+ return;
+ }
+ closing_ = true;
+ cv_.SignalAll();
+ }
+ if (bg_thread_) {
+ bg_thread_->join();
+ }
+}
+
+Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
+ bool compaction) {
+ uint64_t file_size;
+ Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+ if (s.ok()) {
+ MutexLock l(&mu_);
+ OnAddFileImpl(file_path, file_size, compaction);
+ }
+ TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile");
+ return s;
+}
+
+Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
+ uint64_t file_size, bool compaction) {
+ MutexLock l(&mu_);
+ OnAddFileImpl(file_path, file_size, compaction);
+ TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile");
+ return Status::OK();
+}
+
+Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
+ {
+ MutexLock l(&mu_);
+ OnDeleteFileImpl(file_path);
+ }
+ TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile");
+ return Status::OK();
+}
+
+void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) {
+ MutexLock l(&mu_);
+ uint64_t size_added_by_compaction = 0;
+ for (size_t i = 0; i < c->num_input_levels(); i++) {
+ for (size_t j = 0; j < c->num_input_files(i); j++) {
+ FileMetaData* filemeta = c->input(i, j);
+ size_added_by_compaction += filemeta->fd.GetFileSize();
+ }
+ }
+ cur_compactions_reserved_size_ -= size_added_by_compaction;
+
+ auto new_files = c->edit()->GetNewFiles();
+ for (auto& new_file : new_files) {
+ auto fn = TableFileName(c->immutable_cf_options()->cf_paths,
+ new_file.second.fd.GetNumber(),
+ new_file.second.fd.GetPathId());
+ if (in_progress_files_.find(fn) != in_progress_files_.end()) {
+ auto tracked_file = tracked_files_.find(fn);
+ assert(tracked_file != tracked_files_.end());
+ in_progress_files_size_ -= tracked_file->second;
+ in_progress_files_.erase(fn);
+ }
+ }
+}
+
+Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
+ const std::string& new_path,
+ uint64_t* file_size) {
+ {
+ MutexLock l(&mu_);
+ if (file_size != nullptr) {
+ *file_size = tracked_files_[old_path];
+ }
+ OnAddFileImpl(new_path, tracked_files_[old_path], false);
+ OnDeleteFileImpl(old_path);
+ }
+ TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
+ return Status::OK();
+}
+
+void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
+ MutexLock l(&mu_);
+ max_allowed_space_ = max_allowed_space;
+}
+
+void SstFileManagerImpl::SetCompactionBufferSize(
+ uint64_t compaction_buffer_size) {
+ MutexLock l(&mu_);
+ compaction_buffer_size_ = compaction_buffer_size;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
+ MutexLock l(&mu_);
+ if (max_allowed_space_ <= 0) {
+ return false;
+ }
+ return total_files_size_ >= max_allowed_space_;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
+ MutexLock l(&mu_);
+ if (max_allowed_space_ <= 0) {
+ return false;
+ }
+ return total_files_size_ + cur_compactions_reserved_size_ >=
+ max_allowed_space_;
+}
+
+bool SstFileManagerImpl::EnoughRoomForCompaction(
+ ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+ Status bg_error) {
+ MutexLock l(&mu_);
+ uint64_t size_added_by_compaction = 0;
+ // First check if we even have the space to do the compaction
+ for (size_t i = 0; i < inputs.size(); i++) {
+ for (size_t j = 0; j < inputs[i].size(); j++) {
+ FileMetaData* filemeta = inputs[i][j];
+ size_added_by_compaction += filemeta->fd.GetFileSize();
+ }
+ }
+
+ // Update cur_compactions_reserved_size_ so concurrent compaction
+ // don't max out space
+ size_t needed_headroom =
+ cur_compactions_reserved_size_ + size_added_by_compaction +
+ compaction_buffer_size_;
+ if (max_allowed_space_ != 0 &&
+ (needed_headroom + total_files_size_ > max_allowed_space_)) {
+ return false;
+ }
+
+ // Implement more aggressive checks only if this DB instance has already
+ // seen a NoSpace() error. This is tin order to contain a single potentially
+ // misbehaving DB instance and prevent it from slowing down compactions of
+ // other DB instances
+ if (CheckFreeSpace() && bg_error == Status::NoSpace()) {
+ auto fn =
+ TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(),
+ inputs[0][0]->fd.GetPathId());
+ uint64_t free_space = 0;
+ fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr);
+ // needed_headroom is based on current size reserved by compactions,
+ // minus any files created by running compactions as they would count
+ // against the reserved size. If user didn't specify any compaction
+ // buffer, add reserved_disk_buffer_ that's calculated by default so the
+ // compaction doesn't end up leaving nothing for logs and flush SSTs
+ if (compaction_buffer_size_ == 0) {
+ needed_headroom += reserved_disk_buffer_;
+ }
+ needed_headroom -= in_progress_files_size_;
+ if (free_space < needed_headroom + size_added_by_compaction) {
+ // We hit the condition of not enough disk space
+ ROCKS_LOG_ERROR(logger_,
+ "free space [%" PRIu64
+ " bytes] is less than "
+ "needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
+ free_space, needed_headroom);
+ return false;
+ }
+ }
+
+ cur_compactions_reserved_size_ += size_added_by_compaction;
+ // Take a snapshot of cur_compactions_reserved_size_ for when we encounter
+ // a NoSpace error.
+ free_space_trigger_ = cur_compactions_reserved_size_;
+ return true;
+}
+
+uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
+ MutexLock l(&mu_);
+ return cur_compactions_reserved_size_;
+}
+
+uint64_t SstFileManagerImpl::GetTotalSize() {
+ MutexLock l(&mu_);
+ return total_files_size_;
+}
+
+std::unordered_map<std::string, uint64_t>
+SstFileManagerImpl::GetTrackedFiles() {
+ MutexLock l(&mu_);
+ return tracked_files_;
+}
+
+int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
+ return delete_scheduler_.GetRateBytesPerSecond();
+}
+
+void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
+ return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
+}
+
+double SstFileManagerImpl::GetMaxTrashDBRatio() {
+ return delete_scheduler_.GetMaxTrashDBRatio();
+}
+
+void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
+ return delete_scheduler_.SetMaxTrashDBRatio(r);
+}
+
+uint64_t SstFileManagerImpl::GetTotalTrashSize() {
+ return delete_scheduler_.GetTotalTrashSize();
+}
+
+void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
+ const std::string& path) {
+ MutexLock l(&mu_);
+
+ reserved_disk_buffer_ += size;
+ if (path_.empty()) {
+ path_ = path;
+ }
+}
+
+void SstFileManagerImpl::ClearError() {
+ while (true) {
+ MutexLock l(&mu_);
+
+ if (closing_) {
+ return;
+ }
+
+ uint64_t free_space = 0;
+ Status s = fs_->GetFreeSpace(path_, IOOptions(), &free_space, nullptr);
+ free_space = max_allowed_space_ > 0
+ ? std::min(max_allowed_space_, free_space)
+ : free_space;
+ if (s.ok()) {
+ // In case of multi-DB instances, some of them may have experienced a
+ // soft error and some a hard error. In the SstFileManagerImpl, a hard
+ // error will basically override previously reported soft errors. Once
+ // we clear the hard error, we don't keep track of previous errors for
+ // now
+ if (bg_err_.severity() == Status::Severity::kHardError) {
+ if (free_space < reserved_disk_buffer_) {
+ ROCKS_LOG_ERROR(logger_,
+ "free space [%" PRIu64
+ " bytes] is less than "
+ "required disk buffer [%" PRIu64 " bytes]\n",
+ free_space, reserved_disk_buffer_);
+ ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
+ s = Status::NoSpace();
+ }
+ } else if (bg_err_.severity() == Status::Severity::kSoftError) {
+ if (free_space < free_space_trigger_) {
+ ROCKS_LOG_WARN(logger_,
+ "free space [%" PRIu64
+ " bytes] is less than "
+ "free space for compaction trigger [%" PRIu64
+ " bytes]\n",
+ free_space, free_space_trigger_);
+ ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
+ s = Status::NoSpace();
+ }
+ }
+ }
+
+ // Someone could have called CancelErrorRecovery() and the list could have
+ // become empty, so check again here
+ if (s.ok() && !error_handler_list_.empty()) {
+ auto error_handler = error_handler_list_.front();
+ // Since we will release the mutex, set cur_instance_ to signal to the
+ // shutdown thread, if it calls // CancelErrorRecovery() the meantime,
+ // to indicate that this DB instance is busy. The DB instance is
+ // guaranteed to not be deleted before RecoverFromBGError() returns,
+ // since the ErrorHandler::recovery_in_prog_ flag would be true
+ cur_instance_ = error_handler;
+ mu_.Unlock();
+ s = error_handler->RecoverFromBGError();
+ TEST_SYNC_POINT("SstFileManagerImpl::ErrorCleared");
+ mu_.Lock();
+ // The DB instance might have been deleted while we were
+ // waiting for the mutex, so check cur_instance_ to make sure its
+ // still non-null
+ if (cur_instance_) {
+ // Check for error again, since the instance may have recovered but
+ // immediately got another error. If that's the case, and the new
+ // error is also a NoSpace() non-fatal error, leave the instance in
+ // the list
+ Status err = cur_instance_->GetBGError();
+ if (s.ok() && err == Status::NoSpace() &&
+ err.severity() < Status::Severity::kFatalError) {
+ s = err;
+ }
+ cur_instance_ = nullptr;
+ }
+
+ if (s.ok() || s.IsShutdownInProgress() ||
+ (!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
+ // If shutdown is in progress, abandon this handler instance
+ // and continue with the others
+ error_handler_list_.pop_front();
+ }
+ }
+
+ if (!error_handler_list_.empty()) {
+ // If there are more instances to be recovered, reschedule after 5
+ // seconds
+ int64_t wait_until = env_->NowMicros() + 5000000;
+ cv_.TimedWait(wait_until);
+ }
+
+ // Check again for error_handler_list_ empty, as a DB instance shutdown
+ // could have removed it from the queue while we were in timed wait
+ if (error_handler_list_.empty()) {
+ ROCKS_LOG_INFO(logger_, "Clearing error\n");
+ bg_err_ = Status::OK();
+ return;
+ }
+ }
+}
+
+void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
+ Status bg_error) {
+ MutexLock l(&mu_);
+ if (bg_error.severity() == Status::Severity::kSoftError) {
+ if (bg_err_.ok()) {
+ // Setting bg_err_ basically means we're in degraded mode
+ // Assume that all pending compactions will fail similarly. The trigger
+ // for clearing this condition is set to current compaction reserved
+ // size, so we stop checking disk space available in
+ // EnoughRoomForCompaction once this much free space is available
+ bg_err_ = bg_error;
+ }
+ } else if (bg_error.severity() == Status::Severity::kHardError) {
+ bg_err_ = bg_error;
+ } else {
+ assert(false);
+ }
+
+ // If this is the first instance of this error, kick of a thread to poll
+ // and recover from this condition
+ if (error_handler_list_.empty()) {
+ error_handler_list_.push_back(handler);
+ // Release lock before calling join. Its ok to do so because
+ // error_handler_list_ is now non-empty, so no other invocation of this
+ // function will execute this piece of code
+ mu_.Unlock();
+ if (bg_thread_) {
+ bg_thread_->join();
+ }
+ // Start a new thread. The previous one would have exited.
+ bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this));
+ mu_.Lock();
+ } else {
+ // Check if this DB instance is already in the list
+ for (auto iter = error_handler_list_.begin();
+ iter != error_handler_list_.end(); ++iter) {
+ if ((*iter) == handler) {
+ return;
+ }
+ }
+ error_handler_list_.push_back(handler);
+ }
+}
+
+bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
+ MutexLock l(&mu_);
+
+ if (cur_instance_ == handler) {
+ // This instance is currently busy attempting to recover
+ // Nullify it so the recovery thread doesn't attempt to access it again
+ cur_instance_ = nullptr;
+ return false;
+ }
+
+ for (auto iter = error_handler_list_.begin();
+ iter != error_handler_list_.end(); ++iter) {
+ if ((*iter) == handler) {
+ error_handler_list_.erase(iter);
+ return true;
+ }
+ }
+ return false;
+}
+
+Status SstFileManagerImpl::ScheduleFileDeletion(
+ const std::string& file_path, const std::string& path_to_sync,
+ const bool force_bg) {
+ TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::ScheduleFileDeletion",
+ const_cast<std::string*>(&file_path));
+ return delete_scheduler_.DeleteFile(file_path, path_to_sync,
+ force_bg);
+}
+
+void SstFileManagerImpl::WaitForEmptyTrash() {
+ delete_scheduler_.WaitForEmptyTrash();
+}
+
+void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
+ uint64_t file_size, bool compaction) {
+ auto tracked_file = tracked_files_.find(file_path);
+ if (tracked_file != tracked_files_.end()) {
+ // File was added before, we will just update the size
+ assert(!compaction);
+ total_files_size_ -= tracked_file->second;
+ total_files_size_ += file_size;
+ cur_compactions_reserved_size_ -= file_size;
+ } else {
+ total_files_size_ += file_size;
+ if (compaction) {
+ // Keep track of the size of files created by in-progress compactions.
+ // When calculating whether there's enough headroom for new compactions,
+ // this will be subtracted from cur_compactions_reserved_size_.
+ // Otherwise, compactions will be double counted.
+ in_progress_files_size_ += file_size;
+ in_progress_files_.insert(file_path);
+ }
+ }
+ tracked_files_[file_path] = file_size;
+}
+
+void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) {
+ auto tracked_file = tracked_files_.find(file_path);
+ if (tracked_file == tracked_files_.end()) {
+ // File is not tracked
+ assert(in_progress_files_.find(file_path) == in_progress_files_.end());
+ return;
+ }
+
+ total_files_size_ -= tracked_file->second;
+ // Check if it belonged to an in-progress compaction
+ if (in_progress_files_.find(file_path) != in_progress_files_.end()) {
+ in_progress_files_size_ -= tracked_file->second;
+ in_progress_files_.erase(file_path);
+ }
+ tracked_files_.erase(tracked_file);
+}
+
+SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log,
+ std::string trash_dir,
+ int64_t rate_bytes_per_sec,
+ bool delete_existing_trash, Status* status,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk) {
+ std::shared_ptr<FileSystem> fs;
+
+ if (env == Env::Default()) {
+ fs = FileSystem::Default();
+ } else {
+ fs.reset(new LegacyFileSystemWrapper(env));
+ }
+
+ return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec,
+ delete_existing_trash, status, max_trash_db_ratio,
+ bytes_max_delete_chunk);
+}
+
+SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<FileSystem> fs,
+ std::shared_ptr<Logger> info_log,
+ const std::string& trash_dir,
+ int64_t rate_bytes_per_sec,
+ bool delete_existing_trash, Status* status,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk) {
+ SstFileManagerImpl* res =
+ new SstFileManagerImpl(env, fs, info_log, rate_bytes_per_sec,
+ max_trash_db_ratio, bytes_max_delete_chunk);
+
+ // trash_dir is deprecated and not needed anymore, but if user passed it
+ // we will still remove files in it.
+ Status s;
+ if (delete_existing_trash && trash_dir != "") {
+ std::vector<std::string> files_in_trash;
+ s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr);
+ if (s.ok()) {
+ for (const std::string& trash_file : files_in_trash) {
+ if (trash_file == "." || trash_file == "..") {
+ continue;
+ }
+
+ std::string path_in_trash = trash_dir + "/" + trash_file;
+ res->OnAddFile(path_in_trash);
+ Status file_delete =
+ res->ScheduleFileDeletion(path_in_trash, trash_dir);
+ if (s.ok() && !file_delete.ok()) {
+ s = file_delete;
+ }
+ }
+ }
+ }
+
+ if (status) {
+ *status = s;
+ }
+
+ return res;
+}
+
+#else
+
+SstFileManager* NewSstFileManager(Env* /*env*/,
+ std::shared_ptr<Logger> /*info_log*/,
+ std::string /*trash_dir*/,
+ int64_t /*rate_bytes_per_sec*/,
+ bool /*delete_existing_trash*/,
+ Status* status, double /*max_trash_db_ratio*/,
+ uint64_t /*bytes_max_delete_chunk*/) {
+ if (status) {
+ *status =
+ Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE");
+ }
+ return nullptr;
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sst_file_manager_impl.h b/src/rocksdb/file/sst_file_manager_impl.h
new file mode 100644
index 000000000..323ffc7b2
--- /dev/null
+++ b/src/rocksdb/file/sst_file_manager_impl.h
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "port/port.h"
+
+#include "db/compaction/compaction.h"
+#include "db/error_handler.h"
+#include "file/delete_scheduler.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Logger;
+
+// SstFileManager is used to track SST files in the DB and control there
+// deletion rate.
+// All SstFileManager public functions are thread-safe.
+class SstFileManagerImpl : public SstFileManager {
+ public:
+ explicit SstFileManagerImpl(Env* env, std::shared_ptr<FileSystem> fs,
+ std::shared_ptr<Logger> logger,
+ int64_t rate_bytes_per_sec,
+ double max_trash_db_ratio,
+ uint64_t bytes_max_delete_chunk);
+
+ ~SstFileManagerImpl();
+
+ // DB will call OnAddFile whenever a new sst file is added.
+ Status OnAddFile(const std::string& file_path, bool compaction = false);
+
+ // Overload where size of the file is provided by the caller rather than
+ // queried from the filesystem. This is an optimization.
+ Status OnAddFile(const std::string& file_path, uint64_t file_size,
+ bool compaction);
+
+ // DB will call OnDeleteFile whenever an sst file is deleted.
+ Status OnDeleteFile(const std::string& file_path);
+
+ // DB will call OnMoveFile whenever an sst file is move to a new path.
+ Status OnMoveFile(const std::string& old_path, const std::string& new_path,
+ uint64_t* file_size = nullptr);
+
+ // Update the maximum allowed space that should be used by RocksDB, if
+ // the total size of the SST files exceeds max_allowed_space, writes to
+ // RocksDB will fail.
+ //
+ // Setting max_allowed_space to 0 will disable this feature, maximum allowed
+ // space will be infinite (Default value).
+ //
+ // thread-safe.
+ void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override;
+
+ void SetCompactionBufferSize(uint64_t compaction_buffer_size) override;
+
+ // Return true if the total size of SST files exceeded the maximum allowed
+ // space usage.
+ //
+ // thread-safe.
+ bool IsMaxAllowedSpaceReached() override;
+
+ bool IsMaxAllowedSpaceReachedIncludingCompactions() override;
+
+ // Returns true is there is enough (approximate) space for the specified
+ // compaction. Space is approximate because this function conservatively
+ // estimates how much space is currently being used by compactions (i.e.
+ // if a compaction has started, this function bumps the used space by
+ // the full compaction size).
+ bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+ const std::vector<CompactionInputFiles>& inputs,
+ Status bg_error);
+
+ // Bookkeeping so total_file_sizes_ goes back to normal after compaction
+ // finishes
+ void OnCompactionCompletion(Compaction* c);
+
+ uint64_t GetCompactionsReservedSize();
+
+ // Return the total size of all tracked files.
+ uint64_t GetTotalSize() override;
+
+ // Return a map containing all tracked files and there corresponding sizes.
+ std::unordered_map<std::string, uint64_t> GetTrackedFiles() override;
+
+ // Return delete rate limit in bytes per second.
+ virtual int64_t GetDeleteRateBytesPerSecond() override;
+
+ // Update the delete rate limit in bytes per second.
+ virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) override;
+
+ // Return trash/DB size ratio where new files will be deleted immediately
+ virtual double GetMaxTrashDBRatio() override;
+
+ // Update trash/DB size ratio where new files will be deleted immediately
+ virtual void SetMaxTrashDBRatio(double ratio) override;
+
+ // Return the total size of trash files
+ uint64_t GetTotalTrashSize() override;
+
+ // Called by each DB instance using this sst file manager to reserve
+ // disk buffer space for recovery from out of space errors
+ void ReserveDiskBuffer(uint64_t buffer, const std::string& path);
+
+ // Set a flag upon encountering disk full. May enqueue the ErrorHandler
+ // instance for background polling and recovery
+ void StartErrorRecovery(ErrorHandler* db, Status bg_error);
+
+ // Remove the given Errorhandler instance from the recovery queue. Its
+ // not guaranteed
+ bool CancelErrorRecovery(ErrorHandler* db);
+
+ // Mark file as trash and schedule it's deletion. If force_bg is set, it
+ // forces the file to be deleting in the background regardless of DB size,
+ // except when rate limited delete is disabled
+ virtual Status ScheduleFileDeletion(const std::string& file_path,
+ const std::string& dir_to_sync,
+ const bool force_bg = false);
+
+ // Wait for all files being deleteing in the background to finish or for
+ // destructor to be called.
+ virtual void WaitForEmptyTrash();
+
+ DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
+
+ // Stop the error recovery background thread. This should be called only
+ // once in the object's lifetime, and before the destructor
+ void Close();
+
+ private:
+ // REQUIRES: mutex locked
+ void OnAddFileImpl(const std::string& file_path, uint64_t file_size,
+ bool compaction);
+ // REQUIRES: mutex locked
+ void OnDeleteFileImpl(const std::string& file_path);
+
+ void ClearError();
+ bool CheckFreeSpace() {
+ return bg_err_.severity() == Status::Severity::kSoftError;
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::shared_ptr<Logger> logger_;
+ // Mutex to protect tracked_files_, total_files_size_
+ port::Mutex mu_;
+ // The summation of the sizes of all files in tracked_files_ map
+ uint64_t total_files_size_;
+ // The summation of all output files of in-progress compactions
+ uint64_t in_progress_files_size_;
+ // Compactions should only execute if they can leave at least
+ // this amount of buffer space for logs and flushes
+ uint64_t compaction_buffer_size_;
+ // Estimated size of the current ongoing compactions
+ uint64_t cur_compactions_reserved_size_;
+ // A map containing all tracked files and there sizes
+ // file_path => file_size
+ std::unordered_map<std::string, uint64_t> tracked_files_;
+ // A set of files belonging to in-progress compactions
+ std::unordered_set<std::string> in_progress_files_;
+ // The maximum allowed space (in bytes) for sst files.
+ uint64_t max_allowed_space_;
+ // DeleteScheduler used to throttle file deletition.
+ DeleteScheduler delete_scheduler_;
+ port::CondVar cv_;
+ // Flag to force error recovery thread to exit
+ bool closing_;
+ // Background error recovery thread
+ std::unique_ptr<port::Thread> bg_thread_;
+ // A path in the filesystem corresponding to this SFM. This is used for
+ // calling Env::GetFreeSpace. Posix requires a path in the filesystem
+ std::string path_;
+ // Save the current background error
+ Status bg_err_;
+ // Amount of free disk headroom before allowing recovery from hard errors
+ uint64_t reserved_disk_buffer_;
+ // For soft errors, amount of free disk space before we can allow
+ // compactions to run full throttle. If disk space is below this trigger,
+ // compactions will be gated by free disk space > input size
+ uint64_t free_space_trigger_;
+ // List of database error handler instances tracked by this sst file manager
+ std::list<ErrorHandler*> error_handler_list_;
+ // Pointer to ErrorHandler instance that is currently processing recovery
+ ErrorHandler* cur_instance_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/file/writable_file_writer.cc b/src/rocksdb/file/writable_file_writer.cc
new file mode 100644
index 000000000..d5894c17a
--- /dev/null
+++ b/src/rocksdb/file/writable_file_writer.cc
@@ -0,0 +1,429 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/writable_file_writer.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "db/version_edit.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status WritableFileWriter::Append(const Slice& data) {
+ const char* src = data.data();
+ size_t left = data.size();
+ Status s;
+ pending_sync_ = true;
+
+ TEST_KILL_RANDOM("WritableFileWriter::Append:0",
+ rocksdb_kill_odds * REDUCE_ODDS2);
+
+ {
+ IOSTATS_TIMER_GUARD(prepare_write_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
+ writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left,
+ IOOptions(), nullptr);
+ }
+
+ // See whether we need to enlarge the buffer to avoid the flush
+ if (buf_.Capacity() - buf_.CurrentSize() < left) {
+ for (size_t cap = buf_.Capacity();
+ cap < max_buffer_size_; // There is still room to increase
+ cap *= 2) {
+ // See whether the next available size is large enough.
+ // Buffer will never be increased to more than max_buffer_size_.
+ size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
+ if (desired_capacity - buf_.CurrentSize() >= left ||
+ (use_direct_io() && desired_capacity == max_buffer_size_)) {
+ buf_.AllocateNewBuffer(desired_capacity, true);
+ break;
+ }
+ }
+ }
+
+ // Flush only when buffered I/O
+ if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
+ if (buf_.CurrentSize() > 0) {
+ s = Flush();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ assert(buf_.CurrentSize() == 0);
+ }
+
+ // We never write directly to disk with direct I/O on.
+ // or we simply use it for its original purpose to accumulate many small
+ // chunks
+ if (use_direct_io() || (buf_.Capacity() >= left)) {
+ while (left > 0) {
+ size_t appended = buf_.Append(src, left);
+ left -= appended;
+ src += appended;
+
+ if (left > 0) {
+ s = Flush();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ } else {
+ // Writing directly to file bypassing the buffer
+ assert(buf_.CurrentSize() == 0);
+ s = WriteBuffered(src, left);
+ }
+
+ TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds);
+ if (s.ok()) {
+ filesize_ += data.size();
+ CalculateFileChecksum(data);
+ }
+ return s;
+}
+
+Status WritableFileWriter::Pad(const size_t pad_bytes) {
+ assert(pad_bytes < kDefaultPageSize);
+ size_t left = pad_bytes;
+ size_t cap = buf_.Capacity() - buf_.CurrentSize();
+
+ // Assume pad_bytes is small compared to buf_ capacity. So we always
+ // use buf_ rather than write directly to file in certain cases like
+ // Append() does.
+ while (left) {
+ size_t append_bytes = std::min(cap, left);
+ buf_.PadWith(append_bytes, 0);
+ left -= append_bytes;
+ if (left > 0) {
+ Status s = Flush();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ cap = buf_.Capacity() - buf_.CurrentSize();
+ }
+ pending_sync_ = true;
+ filesize_ += pad_bytes;
+ return Status::OK();
+}
+
+Status WritableFileWriter::Close() {
+ // Do not quit immediately on failure the file MUST be closed
+ Status s;
+
+ // Possible to close it twice now as we MUST close
+ // in __dtor, simply flushing is not enough
+ // Windows when pre-allocating does not fill with zeros
+ // also with unbuffered access we also set the end of data.
+ if (!writable_file_) {
+ return s;
+ }
+
+ s = Flush(); // flush cache to OS
+
+ Status interim;
+ // In direct I/O mode we write whole pages so
+ // we need to let the file know where data ends.
+ if (use_direct_io()) {
+ interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr);
+ if (interim.ok()) {
+ interim = writable_file_->Fsync(IOOptions(), nullptr);
+ }
+ if (!interim.ok() && s.ok()) {
+ s = interim;
+ }
+ }
+
+ TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds);
+ interim = writable_file_->Close(IOOptions(), nullptr);
+ if (!interim.ok() && s.ok()) {
+ s = interim;
+ }
+
+ writable_file_.reset();
+ TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds);
+
+ return s;
+}
+
+// write out the cached data to the OS cache or storage if direct I/O
+// enabled
+Status WritableFileWriter::Flush() {
+ Status s;
+ TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
+ rocksdb_kill_odds * REDUCE_ODDS2);
+
+ if (buf_.CurrentSize() > 0) {
+ if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+ if (pending_sync_) {
+ s = WriteDirect();
+ }
+#endif // !ROCKSDB_LITE
+ } else {
+ s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ s = writable_file_->Flush(IOOptions(), nullptr);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // sync OS cache to disk for every bytes_per_sync_
+ // TODO: give log file and sst file different options (log
+ // files could be potentially cached in OS for their whole
+ // life time, thus we might not want to flush at all).
+
+ // We try to avoid sync to the last 1MB of data. For two reasons:
+ // (1) avoid rewrite the same page that is modified later.
+ // (2) for older version of OS, write can block while writing out
+ // the page.
+ // Xfs does neighbor page flushing outside of the specified ranges. We
+ // need to make sure sync range is far from the write offset.
+ if (!use_direct_io() && bytes_per_sync_) {
+ const uint64_t kBytesNotSyncRange =
+ 1024 * 1024; // recent 1MB is not synced.
+ const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
+ if (filesize_ > kBytesNotSyncRange) {
+ uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
+ offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
+ assert(offset_sync_to >= last_sync_size_);
+ if (offset_sync_to > 0 &&
+ offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
+ s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
+ last_sync_size_ = offset_sync_to;
+ }
+ }
+ }
+
+ return s;
+}
+
+const char* WritableFileWriter::GetFileChecksumFuncName() const {
+ if (checksum_func_ != nullptr) {
+ return checksum_func_->Name();
+ } else {
+ return kUnknownFileChecksumFuncName.c_str();
+ }
+}
+
+Status WritableFileWriter::Sync(bool use_fsync) {
+ Status s = Flush();
+ if (!s.ok()) {
+ return s;
+ }
+ TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds);
+ if (!use_direct_io() && pending_sync_) {
+ s = SyncInternal(use_fsync);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds);
+ pending_sync_ = false;
+ return Status::OK();
+}
+
+Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
+ if (!writable_file_->IsSyncThreadSafe()) {
+ return Status::NotSupported(
+ "Can't WritableFileWriter::SyncWithoutFlush() because "
+ "WritableFile::IsSyncThreadSafe() is false");
+ }
+ TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
+ Status s = SyncInternal(use_fsync);
+ TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
+ return s;
+}
+
+Status WritableFileWriter::SyncInternal(bool use_fsync) {
+ Status s;
+ IOSTATS_TIMER_GUARD(fsync_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
+ auto prev_perf_level = GetPerfLevel();
+ IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_);
+ if (use_fsync) {
+ s = writable_file_->Fsync(IOOptions(), nullptr);
+ } else {
+ s = writable_file_->Sync(IOOptions(), nullptr);
+ }
+ SetPerfLevel(prev_perf_level);
+ return s;
+}
+
+Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
+ IOSTATS_TIMER_GUARD(range_sync_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
+ return writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr);
+}
+
+// This method writes to disk the specified data and makes use of the rate
+// limiter if available
+Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
+ Status s;
+ assert(!use_direct_io());
+ const char* src = data;
+ size_t left = size;
+
+ while (left > 0) {
+ size_t allowed;
+ if (rate_limiter_ != nullptr) {
+ allowed = rate_limiter_->RequestToken(
+ left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_,
+ RateLimiter::OpType::kWrite);
+ } else {
+ allowed = left;
+ }
+
+ {
+ IOSTATS_TIMER_GUARD(write_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+
+#ifndef ROCKSDB_LITE
+ FileOperationInfo::TimePoint start_ts;
+ uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr);
+ if (ShouldNotifyListeners()) {
+ start_ts = std::chrono::system_clock::now();
+ old_size = next_write_offset_;
+ }
+#endif
+ {
+ auto prev_perf_level = GetPerfLevel();
+ IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_);
+ s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr);
+ SetPerfLevel(prev_perf_level);
+ }
+#ifndef ROCKSDB_LITE
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::system_clock::now();
+ NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
+ }
+#endif
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ IOSTATS_ADD(bytes_written, allowed);
+ TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds);
+
+ left -= allowed;
+ src += allowed;
+ }
+ buf_.Size(0);
+ return s;
+}
+
+void WritableFileWriter::CalculateFileChecksum(const Slice& data) {
+ if (checksum_func_ != nullptr) {
+ if (is_first_checksum_) {
+ file_checksum_ = checksum_func_->Value(data.data(), data.size());
+ is_first_checksum_ = false;
+ } else {
+ file_checksum_ =
+ checksum_func_->Extend(file_checksum_, data.data(), data.size());
+ }
+ }
+}
+
+// This flushes the accumulated data in the buffer. We pad data with zeros if
+// necessary to the whole page.
+// However, during automatic flushes padding would not be necessary.
+// We always use RateLimiter if available. We move (Refit) any buffer bytes
+// that are left over the
+// whole number of pages to be written again on the next flush because we can
+// only write on aligned
+// offsets.
+#ifndef ROCKSDB_LITE
+Status WritableFileWriter::WriteDirect() {
+ assert(use_direct_io());
+ Status s;
+ const size_t alignment = buf_.Alignment();
+ assert((next_write_offset_ % alignment) == 0);
+
+ // Calculate whole page final file advance if all writes succeed
+ size_t file_advance = TruncateToPageBoundary(alignment, buf_.CurrentSize());
+
+ // Calculate the leftover tail, we write it here padded with zeros BUT we
+ // will write
+ // it again in the future either on Close() OR when the current whole page
+ // fills out
+ size_t leftover_tail = buf_.CurrentSize() - file_advance;
+
+ // Round up and pad
+ buf_.PadToAlignmentWith(0);
+
+ const char* src = buf_.BufferStart();
+ uint64_t write_offset = next_write_offset_;
+ size_t left = buf_.CurrentSize();
+
+ while (left > 0) {
+ // Check how much is allowed
+ size_t size;
+ if (rate_limiter_ != nullptr) {
+ size = rate_limiter_->RequestToken(left, buf_.Alignment(),
+ writable_file_->GetIOPriority(),
+ stats_, RateLimiter::OpType::kWrite);
+ } else {
+ size = left;
+ }
+
+ {
+ IOSTATS_TIMER_GUARD(write_nanos);
+ TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+ FileOperationInfo::TimePoint start_ts;
+ if (ShouldNotifyListeners()) {
+ start_ts = std::chrono::system_clock::now();
+ }
+ // direct writes must be positional
+ s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
+ IOOptions(), nullptr);
+ if (ShouldNotifyListeners()) {
+ auto finish_ts = std::chrono::system_clock::now();
+ NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
+ }
+ if (!s.ok()) {
+ buf_.Size(file_advance + leftover_tail);
+ return s;
+ }
+ }
+
+ IOSTATS_ADD(bytes_written, size);
+ left -= size;
+ src += size;
+ write_offset += size;
+ assert((next_write_offset_ % alignment) == 0);
+ }
+
+ if (s.ok()) {
+ // Move the tail to the beginning of the buffer
+ // This never happens during normal Append but rather during
+ // explicit call to Flush()/Sync() or Close()
+ buf_.RefitTail(file_advance, leftover_tail);
+ // This is where we start writing next time which may or not be
+ // the actual file size on disk. They match if the buffer size
+ // is a multiple of whole pages otherwise filesize_ is leftover_tail
+ // behind
+ next_write_offset_ += file_advance;
+ }
+ return s;
+}
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/writable_file_writer.h b/src/rocksdb/file/writable_file_writer.h
new file mode 100644
index 000000000..123110713
--- /dev/null
+++ b/src/rocksdb/file/writable_file_writer.h
@@ -0,0 +1,171 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+
+// WritableFileWriter is a wrapper on top of Env::WritableFile. It provides
+// facilities to:
+// - Handle Buffered and Direct writes.
+// - Rate limit writes.
+// - Flush and Sync the data to the underlying filesystem.
+// - Notify any interested listeners on the completion of a write.
+// - Update IO stats.
+class WritableFileWriter {
+ private:
+#ifndef ROCKSDB_LITE
+ void NotifyOnFileWriteFinish(uint64_t offset, size_t length,
+ const FileOperationInfo::TimePoint& start_ts,
+ const FileOperationInfo::TimePoint& finish_ts,
+ const Status& status) {
+ FileOperationInfo info(file_name_, start_ts, finish_ts);
+ info.offset = offset;
+ info.length = length;
+ info.status = status;
+
+ for (auto& listener : listeners_) {
+ listener->OnFileWriteFinish(info);
+ }
+ }
+#endif // ROCKSDB_LITE
+
+ bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+ void CalculateFileChecksum(const Slice& data);
+
+ std::unique_ptr<FSWritableFile> writable_file_;
+ std::string file_name_;
+ Env* env_;
+ AlignedBuffer buf_;
+ size_t max_buffer_size_;
+ // Actually written data size can be used for truncate
+ // not counting padding data
+ uint64_t filesize_;
+#ifndef ROCKSDB_LITE
+ // This is necessary when we use unbuffered access
+ // and writes must happen on aligned offsets
+ // so we need to go back and write that page again
+ uint64_t next_write_offset_;
+#endif // ROCKSDB_LITE
+ bool pending_sync_;
+ uint64_t last_sync_size_;
+ uint64_t bytes_per_sync_;
+ RateLimiter* rate_limiter_;
+ Statistics* stats_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ FileChecksumFunc* checksum_func_;
+ std::string file_checksum_ = kUnknownFileChecksum;
+ bool is_first_checksum_ = true;
+
+ public:
+ WritableFileWriter(
+ std::unique_ptr<FSWritableFile>&& file, const std::string& _file_name,
+ const FileOptions& options, Env* env = nullptr,
+ Statistics* stats = nullptr,
+ const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+ FileChecksumFunc* checksum_func = nullptr)
+ : writable_file_(std::move(file)),
+ file_name_(_file_name),
+ env_(env),
+ buf_(),
+ max_buffer_size_(options.writable_file_max_buffer_size),
+ filesize_(0),
+#ifndef ROCKSDB_LITE
+ next_write_offset_(0),
+#endif // ROCKSDB_LITE
+ pending_sync_(false),
+ last_sync_size_(0),
+ bytes_per_sync_(options.bytes_per_sync),
+ rate_limiter_(options.rate_limiter),
+ stats_(stats),
+ listeners_(),
+ checksum_func_(checksum_func) {
+ TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0",
+ reinterpret_cast<void*>(max_buffer_size_));
+ buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
+ buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_));
+#ifndef ROCKSDB_LITE
+ std::for_each(listeners.begin(), listeners.end(),
+ [this](const std::shared_ptr<EventListener>& e) {
+ if (e->ShouldBeNotifiedOnFileIO()) {
+ listeners_.emplace_back(e);
+ }
+ });
+#else // !ROCKSDB_LITE
+ (void)listeners;
+#endif
+ }
+
+ WritableFileWriter(const WritableFileWriter&) = delete;
+
+ WritableFileWriter& operator=(const WritableFileWriter&) = delete;
+
+ ~WritableFileWriter() { Close(); }
+
+ std::string file_name() const { return file_name_; }
+
+ Status Append(const Slice& data);
+
+ Status Pad(const size_t pad_bytes);
+
+ Status Flush();
+
+ Status Close();
+
+ Status Sync(bool use_fsync);
+
+ // Sync only the data that was already Flush()ed. Safe to call concurrently
+ // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(),
+ // returns NotSupported status.
+ Status SyncWithoutFlush(bool use_fsync);
+
+ uint64_t GetFileSize() const { return filesize_; }
+
+ Status InvalidateCache(size_t offset, size_t length) {
+ return writable_file_->InvalidateCache(offset, length);
+ }
+
+ FSWritableFile* writable_file() const { return writable_file_.get(); }
+
+ bool use_direct_io() { return writable_file_->use_direct_io(); }
+
+ bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; }
+
+ void TEST_SetFileChecksumFunc(FileChecksumFunc* checksum_func) {
+ checksum_func_ = checksum_func;
+ }
+
+ const std::string& GetFileChecksum() const { return file_checksum_; }
+
+ const char* GetFileChecksumFuncName() const;
+
+ private:
+ // Used when os buffering is OFF and we are writing
+ // DMA such as in Direct I/O mode
+#ifndef ROCKSDB_LITE
+ Status WriteDirect();
+#endif // !ROCKSDB_LITE
+ // Normal write
+ Status WriteBuffered(const char* data, size_t size);
+ Status RangeSync(uint64_t offset, uint64_t nbytes);
+ Status SyncInternal(bool use_fsync);
+};
+} // namespace ROCKSDB_NAMESPACE